Skip to content

Commit

Permalink
Merge pull request #59 from ncats/patch-enrich-speed
Browse files Browse the repository at this point in the history
Patch enrich speed
  • Loading branch information
johnbraisted authored Oct 13, 2023
2 parents 25ab2aa + 8dd8f0a commit de082d9
Show file tree
Hide file tree
Showing 13 changed files with 528 additions and 28 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: RaMP
Title: RaMP (Relational Database of Metabolomic Pathways)
Type: Package
Version: 2.3.2
Version: 2.3.3
License: GPL-2
Depends: R (>= 3.6.0)
Authors@R: c(
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export(getPathwayFromAnalyte)
export(getPathwayNameList)
export(getPrefixesFromAnalytes)
export(getRaMPAnalyteIntersections)
export(getReactionsForAnalytes)
export(pathwayResultsPlot)
export(plotCataNetwork)
export(rampFastCata)
Expand Down
62 changes: 41 additions & 21 deletions R/ReturnPathwaysEnrich_InputAnalytes.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,22 @@ runFisherTest <- function(analytes,
if (biospecimen == "Adipose") {
biospecimen <- "Adipose tissue"
}

# we don't need all fields from all tables joined
# Get metabolites that belong to a specific biospecimen
# query <- paste0(
# "SELECT analytehasontology.*, ontology.*, analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
# biospecimen,
# "') and ontology.rampOntologyId = analytehasontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
# )

# less data pull back
query <- paste0(
"SELECT analytehasontology.*, ontology.*, analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
"SELECT analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
biospecimen,
"') and ontology.rampOntologyId = analytehasontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
"') and analytehasontology.rampOntologyId = ontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
)

con <- connectToRaMP()
backgrounddf <- RMariaDB::dbGetQuery(con, query)
RMariaDB::dbDisconnect(con)
Expand Down Expand Up @@ -152,7 +162,7 @@ runFisherTest <- function(analytes,

# Get the total number of metabolites that are mapped to pathways in RaMP (that's the default background)
# added conditional to not pull hmdb ids
query <- "select * from analytehaspathway where pathwaySource != 'hmdb';"
query <- "select distinct rampId, pathwaySource from analytehaspathway where pathwaySource != 'hmdb';"
con <- connectToRaMP()
allids <- RMariaDB::dbGetQuery(con, query)

Expand Down Expand Up @@ -204,9 +214,13 @@ runFisherTest <- function(analytes,
# Loop through each pathway, build the contingency table, and calculate Fisher's Exact
# test p-value
pval <- totinpath <- userinpath <- pidused <- c()

pidCount = 0

for (i in pid) {
ids_inpath <- pathwaydf[which(pathwaydf$pathwayRampId == i), "rampId"]

pidCount = pidCount + 1

if (analyte_type == "metabolites") {
# Check to make sure that this pathway does have metabolites
Expand All @@ -219,13 +233,15 @@ runFisherTest <- function(analytes,
bg_in_pathway <- length(unique(grep("RAMP_C", ids_inpath_bg, value = TRUE)))
}
}
inputkegg <- segregated_id_list[[1]][1][[1]]
inputreact <- segregated_id_list[[1]][2][[1]]
inputwiki <- segregated_id_list[[1]][3][[1]]
inputcustom <- segregated_id_list[[1]][[4]]
tot_user_analytes <- length(grep("RAMP_C", unique(pathwaydf$rampId)))
if (background_type != "database") {
tot_bg_analytes <- length(grep("RAMP_C", unique(backgrounddf$rampId)))
if(pidCount == 1) {
inputkegg <- segregated_id_list[[1]][1][[1]]
inputreact <- segregated_id_list[[1]][2][[1]]
inputwiki <- segregated_id_list[[1]][3][[1]]
inputcustom <- segregated_id_list[[1]][[4]]
tot_user_analytes <- length(grep("RAMP_C", unique(pathwaydf$rampId)))
if (background_type != "database") {
tot_bg_analytes <- length(grep("RAMP_C", unique(backgrounddf$rampId)))
}
}
} else { # if genes
# Check to make sure that this pathway does have genes
Expand All @@ -234,11 +250,13 @@ runFisherTest <- function(analytes,
} else {
user_in_pathway <- length(unique(grep("RAMP_G", ids_inpath, value = TRUE)))
}
inputkegg <- segregated_id_list[[2]][1][[1]]
inputreact <- segregated_id_list[[2]][2][[1]]
inputwiki <- segregated_id_list[[2]][3][[1]]
inputcustom <- segregated_id_list[[2]][[4]]
tot_user_analytes <- length(grep("RAMP_G", unique(pathwaydf$rampId)))
if(pidCount == 1) {
inputkegg <- segregated_id_list[[2]][1][[1]]
inputreact <- segregated_id_list[[2]][2][[1]]
inputwiki <- segregated_id_list[[2]][3][[1]]
inputcustom <- segregated_id_list[[2]][[4]]
tot_user_analytes <- length(grep("RAMP_G", unique(pathwaydf$rampId)))
}
## tot_bg_analytes <- length(grep("RAMP_G", unique(backgrounddf$rampId)))
}
if ((!is.na(inputkegg$pathwayRampId[1])) && i %in% inputkegg$pathwayRampId) {
Expand Down Expand Up @@ -875,12 +893,14 @@ getPathwayFromAnalyte <- function(analytes = "none",
return(df2)
}


##' @param analytes a vector of analytes (genes or metabolites) that need to be searched
##' @param pathways If "RaMP" (default), use pathway definitions within RaMP-DB. Else, supply path to gmx file containing custom pathway definitions. GMX files are a tab-separated format that contain one analyte set per column, with the name of the set in the first row, and constituent analytes in subsequent rows
##' @param analyte_type "genes" or "metabolites"
##' @return A pathwaydf compatible with runFisherTest
##' @author Andrew Patt
#' Reads an excel file containing metabolite and gene annotations and filters pathways
#' to those pathways represented by the input analytes.
#'
#' @param analytes a vector of analytes (genes or metabolites) that need to be searched
#' @param pathways If "RaMP" (default), use pathway definitions within RaMP-DB. Else, supply path to gmx file containing custom pathway definitions. GMX files are a tab-separated format that contain one analyte set per column, with the name of the set in the first row, and constituent analytes in subsequent rows
#' @param analyte_type "genes" or "metabolites"
#' @return A pathwaydf compatible with runFisherTest
#' @author Andrew Patt
getCustomPathwayFromAnalyte <- function(analytes, pathways, analyte_type) {
print("Starting getCustomPathwayFromAnalyte()")
tryCatch(
Expand Down
14 changes: 10 additions & 4 deletions R/plottingFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,21 @@ plotCataNetwork <- function(catalyzedf = "") {
#' @param sig_cutoff Aesthetic, shows pvalue cutoff for significant pathways
#' @param interactive If TRUE, return interactive plotly object instead of ggplot object
#' @export
pathwayResultsPlot <- function(pathwaysSig, pval = "FDR", perc_analyte_overlap = 0.2,
perc_pathway_overlap = 0.2, min_pathway_tocluster = 3,
pathwayResultsPlot <- function(pathwaysSig, pval = "FDR", perc_analyte_overlap = 0.5,
perc_pathway_overlap = 0.5, min_pathway_tocluster = 3,
text_size = 16, sig_cutoff = 0.05, interactive=FALSE) {

if( !('cluster_assignment' %in% colnames(pathwaysSig$fishresult))) {
fishClustering <- findCluster(pathwaysSig,
perc_analyte_overlap = perc_analyte_overlap,
perc_pathway_overlap = perc_pathway_overlap,
min_pathway_tocluster = min_pathway_tocluster
)
fishresult <- fishClustering$fishresults
)
fishresult <- fishClustering$fishresults
} else {
fishresult <- pathwaysSig$fishresults
}

if (pathwaysSig$analyte_type == "genes" | pathwaysSig$analyte_type == "metabolites") {
inPath <- fishresult$Num_In_Path
totPath <- fishresult$Total_In_Path
Expand Down
245 changes: 245 additions & 0 deletions R/rampReactionQueries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
# RaMP Reaction Queries

#' getReactionsForAnalytes
#'
#' @param analytes list of analytes
#' @param analyteType analyte type, 'metabolites' (default), 'genes' or 'both'
#' @param namesOrIds indicates if input analyte list contains identifiers or analyte names
#' @param onlyHumanMets boolean to only return pathways containing only human metabolites (ChEBI ontology) (dev in progress)
#' @param humanProtein boolean to only control pathways catalyzed by a human proteins (having human Uniprot) (dev in progress)
#' @param includeTransportRxns if TRUE, returns metabolic and transport reactions
#' @param rxnDirs character vector of length > 1, specifying reaction directions to return c("UN", "LR", "RL", "BD", "ALL"), default = c("UN").
#'
#' @return a list of reaction information on each input analyte, separate data.frame for metabolites, genes, and common reactions
#' @export
getReactionsForAnalytes <- function(analytes, analyteType='metabolites', namesOrIds='ids', onlyHumanMets=F, humanProtein=F, includeTransportRxns=F, rxnDirs=c("UN")) {

genes <- data.frame()
metabolites <- data.frame()
mdf <- data.frame()
gdf <- data.frame()
if(namesOrIds == 'ids') {
analyteSourceInfo <- getRampSourceInfoFromAnalyteIDs(analytes)
resultGenes <- analyteSourceInfo[analyteSourceInfo$geneOrCompound == 'gene',]
resultMets <- analyteSourceInfo[analyteSourceInfo$geneOrCompound == 'compound',]

if(!is.null(resultMets) && nrow(resultMets)>0) {

print("Retrieving reactions for compounds")

mets <- resultMets[,c('sourceId', 'rampId')]
rampIds <- unique(unlist(mets$rampId))
mdf <- getReactionsForRaMPCompoundIds(rampCompoundIds=rampIds, onlyHumanMets=onlyHumanMets, humanProtein=humanProtein, includeTransportRxns=includeTransportRxns, rxnDirs=rxnDirs)
if(nrow(mdf) > 0) {
mdf <- merge(mets, mdf, by.x='rampId', by.y='ramp_cmpd_id')
}
}

if(!is.null(resultGenes) && nrow(resultGenes)>0) {

print("Retrieving reactions for genes/proteins")

genes <- resultGenes[,c('sourceId', 'rampId')]
rampIds <- unique(unlist(genes$rampId))
gdf <- getReactionsForRaMPGeneIds(rampGeneIds=rampIds, onlyHumanMets=onlyHumanMets, humanProtein=humanProtein, includeTransportRxns)
if(nrow(gdf) > 0) {
gdf <- merge(genes, gdf, by.x='rampId', by.y='ramp_gene_id')
}
}

# if(!is.null(resultMets) && ncol(resultMets)>0) {
# metatabolites <- resultMets[,c('sourceId', 'rampId')]
# }

} else { # query on names
# do we have a helper function to get ramp ids from names?

}

resultList <- list()
resultList[['met2rxn']] <- mdf
resultList[['prot2rxn']] <- gdf
resultList[['metProteinCommonReactions']] <- data.frame()

# find common reactions
if(nrow(mdf) > 0 && nrow(gdf) > 0) {
mRampRxnIds <- unlist(mdf$ramp_rxn_id)
gRampRxnIds <- unlist(gdf$ramp_rxn_id)
commonRxnIds <- intersect(mRampRxnIds, gRampRxnIds)
if(length(commonRxnIds) > 0) {
mRxn <- mdf[mdf$ramp_rxn_id %in% commonRxnIds,]
gRxn <- gdf[gdf$ramp_rxn_id %in% commonRxnIds,]

# one row per reaction, include concat mets, concat proteins
# probably a more streamlined way to do this...
commonRampRxnList <- c()
commonRxnList <- c()
mRxnSourceIds <- c()
mRxnSourceNames <- c()
gRxnSourceIds <- c()
gRxnSourceName <- c()
gRxnUniprot <- c()
commonRxnLabel <- c()
commonRxnHTMLEq <- c()
hasHumanProtein <- c()
onlyHumanMets <- c()
isTransport <- c()
rxnDir <- c()
for(rxn in commonRxnIds) {
commonRampRxnList <- c(commonRampRxnList, rxn)
commonRxnList <- c(commonRxnList, paste0(unique(mRxn$rxn_source_id[mRxn$ramp_rxn_id == rxn]), collapse = ','))
mRxnSourceIds <- c(mRxnSourceIds, paste0(unique(mRxn$sourceId[mRxn$ramp_rxn_id == rxn]), collapse = ','))
mRxnSourceNames <- c(mRxnSourceNames, paste0(unique(mRxn$met_name[mRxn$ramp_rxn_id == rxn]), collapse = ','))

gRxnSourceIds <- c(gRxnSourceIds, paste0(unique(gRxn$sourceId[gRxn$ramp_rxn_id == rxn]), collapse = ','))
gRxnUniprot <- c(gRxnUniprot, paste0(unique(gRxn$uniprot[gRxn$ramp_rxn_id == rxn]), collapse = ','))
gRxnSourceName <- c(gRxnSourceName, paste0(unique(gRxn$protein_name[gRxn$ramp_rxn_id == rxn]), collapse = ','))

rxnDir <- c(rxnDir, paste0(unique(mRxn$direction[mRxn$ramp_rxn_id == rxn]), collapse = ','))

commonRxnLabel <- c(commonRxnLabel, paste0(unique(mRxn$label[mRxn$ramp_rxn_id == rxn]), collapse = ','))
commonRxnHTMLEq <- c(commonRxnHTMLEq, paste0(unique(mRxn$html_equation[mRxn$ramp_rxn_id == rxn]), collapse = ','))

isTransport <- c(isTransport, paste0(unique(mRxn$is_transport[mRxn$ramp_rxn_id == rxn]), collapse = ','))
hasHumanProtein <- c(hasHumanProtein, paste0(unique(mRxn$has_human_prot[mRxn$ramp_rxn_id == rxn]), collapse = ','))
onlyHumanMets <- c(onlyHumanMets, paste0(unique(mRxn$only_human_mets[mRxn$ramp_rxn_id == rxn]), collapse = ','))
}
commonReactions <- data.frame(list('metabolites' = mRxnSourceIds, 'met_names' = mRxnSourceNames ,'genes' = gRxnSourceIds,
'uniprot' = gRxnUniprot, 'proteinNames' = gRxnSourceName,
'reactionId' = commonRxnList, 'rxn_dir' = rxnDir,'rxn_label' = commonRxnLabel,
'rxn_html_label' = commonRxnHTMLEq,
'is_transport' = isTransport,
'has_human_protein' = hasHumanProtein,
'only_human_mets' = onlyHumanMets))

resultList[['metProteinCommonReactions']] <- commonReactions
}
}

return(resultList)
}

#' getReactionsForRaMPCompoundIds returns reactions for a collection of ramp compound ids
#'
#' @param rampCompoundIds list of ramp compound ids
#' @param onlyHumanMets boolean to only return pathways containing only human metabolites (ChEBI ontology) (dev in progress)
#' @param humanProtein boolean to only control pathways catalyzed by a human proteins (having human Uniprot) (dev in progress)
#' @param includeTransportRxns if TRUE, returns metabolic and transport reactions
#'
#' @return returns a dataframe of reaction information for each ramp compound id
getReactionsForRaMPCompoundIds <- function(rampCompoundIds, onlyHumanMets=F, humanProtein=F, includeTransportRxns=F, rxnDirs=c("UN")) {

idStr <- listToQueryString(rampCompoundIds)
query <- paste0("select mr.ramp_rxn_id, mr.ramp_cmpd_id, mr.met_source_id, mr.substrate_product, mr.is_cofactor, mr.met_name,
mr.ramp_rxn_id, rxn.rxn_source_id, rxn.is_transport, rxn.label, rxn.direction, rxn.equation, rxn.html_equation, rxn.ec_num, rxn.has_human_prot, rxn.only_human_mets
from reaction2met mr, reaction rxn
where mr.ramp_cmpd_id in (",idStr,") and rxn.ramp_rxn_id = mr.ramp_rxn_id")

if(length(rxnDirs) == 1) {
query <- paste0(query, " and rxn.direction = '",rxnDirs[1],"'")
} else if(length(rxnDirs)>1) {
query <- paste0(query, " and rxn.direction in (",listToQueryString(rxnDirs),")")
} else {
print("rxnDirs must be of length > 0")
}

if(humanProtein) {
query <- paste0(query," and rxn.has_human_prot = 1")
}

if(onlyHumanMets) {
query <- paste0(query, " and rxn.only_human_mets = 1")
}

if(!includeTransportRxns) {
query <- paste0(query, " and rxn.is_transport = 0")
}

conn <- connectToRaMP()
df <- RMariaDB::dbGetQuery(conn,query)
RMariaDB::dbDisconnect(conn=conn)
return(df)
}



#' getReactionsForRaMPGeneIds returns reactions for a collection of ramp compound ids
#'
#' @param rampGeneIds list of ramp compound ids
#' @param onlyHumanMets boolean to only return pathways containing only human metabolites (ChEBI ontology) (dev in progress)
#' @param humanProtein boolean to only control pathways catalyzed by a human proteins (having human Uniprot) (dev in progress)
#' @param includeTransportRxns if TRUE, returns metabolic and transport reactions
#'
#' @return returns a dataframe of reaction information for each ramp compound id
getReactionsForRaMPGeneIds <- function(rampGeneIds, onlyHumanMets=F, humanProtein=F, includeTransportRxns=F, rxnDirs=c("UN")) {

idStr <- listToQueryString(rampGeneIds)
query <- paste0("select gr.ramp_rxn_id, gr.ramp_gene_id, gr.uniprot, gr.protein_name,
gr.ramp_rxn_id, rxn.rxn_source_id, rxn.is_transport, rxn.label, rxn.direction, rxn.equation, rxn.html_equation, rxn.ec_num, rxn.has_human_prot, rxn.only_human_mets
from reaction2protein gr, reaction rxn
where gr.ramp_gene_id in (",idStr,") and rxn.ramp_rxn_id = gr.ramp_rxn_id")

if(length(rxnDirs) == 1) {
query <- paste0(query, " and rxn.direction = '",rxnDirs[1],"'")
} else if(length(rxnDirs)>1) {
query <- paste0(query, " and rxn.direction in (",listToQueryString(rxnDirs),")")
} else {
print("rxnDirs must be of length > 0")
}

if(humanProtein) {
query <- paste0(query, " and rxn.has_human_prot = 1")
}

if(onlyHumanMets) {
query <- paste0(query, " and rxn.only_human_mets = 1")
}

if(!includeTransportRxns) {
query <- paste0(query, " and rxn.is_transport = 0")
}

conn <- connectToRaMP()
df <- RMariaDB::dbGetQuery(conn,query)
RMariaDB::dbDisconnect(conn=conn)
return(df)
}


#####################################################
###################################
#
# general dev note... perhaps the functions below should be moved to rampQueryHelpers
#
##



#' getRampSourceInfoFromAnalyteIDs Utility method to extract source table information from analyte ids
#'
#' @param analytes list of analyte ids
#'
#' @return returns a dataframe of ramp analyte source information
getRampSourceInfoFromAnalyteIDs <- function(analytes) {

analyteStr <- listToQueryString(analytes)

query = paste("select distinct sourceId, rampId, geneOrCompound from source where sourceId in (",analyteStr,")")

conn <- connectToRaMP()
df <- RMariaDB::dbGetQuery(conn,query)
RMariaDB::dbDisconnect(conn=conn)

return(df)
}


#' listToQueryString utility method to convert an id list to a comma separate string, with single quoted values.
#'
#' @param analytes list of analytes (can be names or ids)
#'
#' @return comma separated list of single quoted analyte ids or names
listToQueryString <- function(analytes) {
analyteStr <- paste0("'", paste0(analytes, collapse = "','"), "'", sep="")
return (analyteStr)
}
Loading

0 comments on commit de082d9

Please sign in to comment.