Merge pull request #59 from ncats/patch-enrich-speed

Patch enrich speed
ncats · Oct 13, 2023 · de082d9 · de082d9
2 parents 25ab2aa + 8dd8f0a
commit de082d9
Show file tree

Hide file tree

Showing 13 changed files with 528 additions and 28 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: RaMP
 Title: RaMP (Relational Database of Metabolomic Pathways)
 Type: Package
-Version: 2.3.2
+Version: 2.3.3
 License: GPL-2
 Depends: R (>= 3.6.0)
 Authors@R: c(

diff --git a/NAMESPACE b/NAMESPACE
@@ -19,6 +19,7 @@ export(getPathwayFromAnalyte)
 export(getPathwayNameList)
 export(getPrefixesFromAnalytes)
 export(getRaMPAnalyteIntersections)
+export(getReactionsForAnalytes)
 export(pathwayResultsPlot)
 export(plotCataNetwork)
 export(rampFastCata)

diff --git a/R/ReturnPathwaysEnrich_InputAnalytes.R b/R/ReturnPathwaysEnrich_InputAnalytes.R
@@ -88,12 +88,22 @@ runFisherTest <- function(analytes,
       if (biospecimen == "Adipose") {
         biospecimen <- "Adipose tissue"
       }
+
+      # we don't need all fields from all tables joined
       # Get metabolites that belong to a specific biospecimen
+      # query <- paste0(
+      #   "SELECT analytehasontology.*, ontology.*, analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
+      #   biospecimen,
+      #   "') and ontology.rampOntologyId = analytehasontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
+      # )
+
+      # less data pull back
       query <- paste0(
-        "SELECT analytehasontology.*, ontology.*, analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
+        "SELECT analytehaspathway.* from analytehasontology, ontology, analytehaspathway where ontology.commonName in ('",
         biospecimen,
-        "') and ontology.rampOntologyId = analytehasontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
+        "') and analytehasontology.rampOntologyId = ontology.rampOntologyId and analytehasontology.rampCompoundId = analytehaspathway.rampId"
       )
+
       con <- connectToRaMP()
       backgrounddf <- RMariaDB::dbGetQuery(con, query)
       RMariaDB::dbDisconnect(con)
@@ -152,7 +162,7 @@ runFisherTest <- function(analytes,
 
   # Get the total number of metabolites that are mapped to pathways in RaMP (that's the default background)
   # added conditional to not pull hmdb ids
-  query <- "select * from analytehaspathway where pathwaySource != 'hmdb';"
+  query <- "select distinct rampId, pathwaySource from analytehaspathway where pathwaySource != 'hmdb';"
   con <- connectToRaMP()
   allids <- RMariaDB::dbGetQuery(con, query)
 
@@ -204,9 +214,13 @@ runFisherTest <- function(analytes,
   # Loop through each pathway, build the contingency table, and calculate Fisher's Exact
   # test p-value
   pval <- totinpath <- userinpath <- pidused <- c()
+
+  pidCount = 0
+
   for (i in pid) {
     ids_inpath <- pathwaydf[which(pathwaydf$pathwayRampId == i), "rampId"]
 
+    pidCount = pidCount + 1
 
     if (analyte_type == "metabolites") {
       # Check to make sure that this pathway does have metabolites
@@ -219,13 +233,15 @@ runFisherTest <- function(analytes,
           bg_in_pathway <- length(unique(grep("RAMP_C", ids_inpath_bg, value = TRUE)))
         }
       }
-      inputkegg <- segregated_id_list[[1]][1][[1]]
-      inputreact <- segregated_id_list[[1]][2][[1]]
-      inputwiki <- segregated_id_list[[1]][3][[1]]
-      inputcustom <- segregated_id_list[[1]][[4]]
-      tot_user_analytes <- length(grep("RAMP_C", unique(pathwaydf$rampId)))
-      if (background_type != "database") {
-        tot_bg_analytes <- length(grep("RAMP_C", unique(backgrounddf$rampId)))
+      if(pidCount == 1) {
+        inputkegg <- segregated_id_list[[1]][1][[1]]
+        inputreact <- segregated_id_list[[1]][2][[1]]
+        inputwiki <- segregated_id_list[[1]][3][[1]]
+        inputcustom <- segregated_id_list[[1]][[4]]
+        tot_user_analytes <- length(grep("RAMP_C", unique(pathwaydf$rampId)))
+        if (background_type != "database") {
+          tot_bg_analytes <- length(grep("RAMP_C", unique(backgrounddf$rampId)))
+        }
       }
     } else { # if genes
       # Check to make sure that this pathway does have genes
@@ -234,11 +250,13 @@ runFisherTest <- function(analytes,
       } else {
         user_in_pathway <- length(unique(grep("RAMP_G", ids_inpath, value = TRUE)))
       }
-      inputkegg <- segregated_id_list[[2]][1][[1]]
-      inputreact <- segregated_id_list[[2]][2][[1]]
-      inputwiki <- segregated_id_list[[2]][3][[1]]
-      inputcustom <- segregated_id_list[[2]][[4]]
-      tot_user_analytes <- length(grep("RAMP_G", unique(pathwaydf$rampId)))
+      if(pidCount == 1) {
+        inputkegg <- segregated_id_list[[2]][1][[1]]
+        inputreact <- segregated_id_list[[2]][2][[1]]
+        inputwiki <- segregated_id_list[[2]][3][[1]]
+        inputcustom <- segregated_id_list[[2]][[4]]
+        tot_user_analytes <- length(grep("RAMP_G", unique(pathwaydf$rampId)))
+      }
       ## tot_bg_analytes <- length(grep("RAMP_G", unique(backgrounddf$rampId)))
     }
     if ((!is.na(inputkegg$pathwayRampId[1])) && i %in% inputkegg$pathwayRampId) {
@@ -875,12 +893,14 @@ getPathwayFromAnalyte <- function(analytes = "none",
   return(df2)
 }
 
-
-##' @param analytes a vector of analytes (genes or metabolites) that need to be searched
-##' @param pathways If "RaMP" (default), use pathway definitions within RaMP-DB. Else, supply path to gmx file containing custom pathway definitions. GMX files are a tab-separated format that contain one analyte set per column, with the name of the set in the first row, and constituent analytes in subsequent rows
-##' @param analyte_type "genes" or "metabolites"
-##' @return A pathwaydf compatible with runFisherTest
-##' @author Andrew Patt
+#' Reads an excel file containing metabolite and gene annotations and filters pathways
+#' to those pathways represented by the input analytes.
+#'
+#' @param analytes a vector of analytes (genes or metabolites) that need to be searched
+#' @param pathways If "RaMP" (default), use pathway definitions within RaMP-DB. Else, supply path to gmx file containing custom pathway definitions. GMX files are a tab-separated format that contain one analyte set per column, with the name of the set in the first row, and constituent analytes in subsequent rows
+#' @param analyte_type "genes" or "metabolites"
+#' @return A pathwaydf compatible with runFisherTest
+#' @author Andrew Patt
 getCustomPathwayFromAnalyte <- function(analytes, pathways, analyte_type) {
   print("Starting getCustomPathwayFromAnalyte()")
   tryCatch(

diff --git a/R/plottingFunctions.R b/R/plottingFunctions.R
@@ -65,15 +65,21 @@ plotCataNetwork <- function(catalyzedf = "") {
 #' @param sig_cutoff Aesthetic, shows pvalue cutoff for significant pathways
 #' @param interactive If TRUE, return interactive plotly object instead of ggplot object
 #' @export
-pathwayResultsPlot <- function(pathwaysSig, pval = "FDR", perc_analyte_overlap = 0.2,
-                                 perc_pathway_overlap = 0.2, min_pathway_tocluster = 3,
+pathwayResultsPlot <- function(pathwaysSig, pval = "FDR", perc_analyte_overlap = 0.5,
+                                 perc_pathway_overlap = 0.5, min_pathway_tocluster = 3,
                                  text_size = 16, sig_cutoff = 0.05, interactive=FALSE) {
+
+  if( !('cluster_assignment' %in% colnames(pathwaysSig$fishresult))) {
     fishClustering <- findCluster(pathwaysSig,
                                   perc_analyte_overlap = perc_analyte_overlap,
                                   perc_pathway_overlap = perc_pathway_overlap,
                                   min_pathway_tocluster = min_pathway_tocluster
-                                  )
-  fishresult <- fishClustering$fishresults
+    )
+    fishresult <- fishClustering$fishresults
+  } else {
+    fishresult <- pathwaysSig$fishresults
+  }
+
   if (pathwaysSig$analyte_type == "genes" | pathwaysSig$analyte_type == "metabolites") {
     inPath <- fishresult$Num_In_Path
     totPath <- fishresult$Total_In_Path

diff --git a/R/rampReactionQueries.R b/R/rampReactionQueries.R
@@ -0,0 +1,245 @@
+# RaMP Reaction Queries
+
+#' getReactionsForAnalytes
+#'
+#' @param analytes list of analytes
+#' @param analyteType analyte type, 'metabolites' (default), 'genes' or 'both'
+#' @param namesOrIds indicates if input analyte list contains identifiers or analyte names
+#' @param onlyHumanMets boolean to only return pathways containing only human metabolites (ChEBI ontology) (dev in progress)
+#' @param humanProtein boolean to only control pathways catalyzed by a human proteins (having human Uniprot) (dev in progress)
+#' @param includeTransportRxns if TRUE, returns metabolic and transport reactions
+#' @param rxnDirs character vector of length > 1, specifying reaction directions to return  c("UN", "LR", "RL", "BD", "ALL"), default = c("UN").
+#'
+#' @return a list of reaction information on each input analyte, separate data.frame for metabolites, genes, and common reactions
+#' @export
+getReactionsForAnalytes <- function(analytes, analyteType='metabolites', namesOrIds='ids', onlyHumanMets=F, humanProtein=F, includeTransportRxns=F, rxnDirs=c("UN")) {
+
+  genes <- data.frame()
+  metabolites <- data.frame()
+  mdf <- data.frame()
+  gdf <- data.frame()
+  if(namesOrIds == 'ids') {
+    analyteSourceInfo <- getRampSourceInfoFromAnalyteIDs(analytes)
+    resultGenes <- analyteSourceInfo[analyteSourceInfo$geneOrCompound == 'gene',]
+    resultMets <- analyteSourceInfo[analyteSourceInfo$geneOrCompound == 'compound',]
+
+    if(!is.null(resultMets) && nrow(resultMets)>0) {
+
+      print("Retrieving reactions for compounds")
+
+      mets <- resultMets[,c('sourceId', 'rampId')]
+      rampIds <- unique(unlist(mets$rampId))
+      mdf <- getReactionsForRaMPCompoundIds(rampCompoundIds=rampIds, onlyHumanMets=onlyHumanMets, humanProtein=humanProtein, includeTransportRxns=includeTransportRxns, rxnDirs=rxnDirs)
+      if(nrow(mdf) > 0) {
+        mdf <- merge(mets, mdf, by.x='rampId', by.y='ramp_cmpd_id')
+      }
+    }
+
+    if(!is.null(resultGenes) && nrow(resultGenes)>0) {
+
+      print("Retrieving reactions for genes/proteins")
+
+      genes <- resultGenes[,c('sourceId', 'rampId')]
+      rampIds <- unique(unlist(genes$rampId))
+      gdf <- getReactionsForRaMPGeneIds(rampGeneIds=rampIds, onlyHumanMets=onlyHumanMets, humanProtein=humanProtein, includeTransportRxns)
+      if(nrow(gdf) > 0) {
+        gdf <- merge(genes, gdf, by.x='rampId', by.y='ramp_gene_id')
+      }
+    }
+
+    # if(!is.null(resultMets) && ncol(resultMets)>0) {
+    #   metatabolites <- resultMets[,c('sourceId', 'rampId')]
+    # }
+
+  } else { # query on names
+    # do we have a helper function to get ramp ids from names?
+
+  }
+
+  resultList <- list()
+  resultList[['met2rxn']] <- mdf
+  resultList[['prot2rxn']] <- gdf
+  resultList[['metProteinCommonReactions']] <- data.frame()
+
+  # find common reactions
+  if(nrow(mdf) > 0 && nrow(gdf) > 0) {
+    mRampRxnIds <- unlist(mdf$ramp_rxn_id)
+    gRampRxnIds <- unlist(gdf$ramp_rxn_id)
+    commonRxnIds <- intersect(mRampRxnIds, gRampRxnIds)
+    if(length(commonRxnIds) > 0) {
+      mRxn <- mdf[mdf$ramp_rxn_id %in% commonRxnIds,]
+      gRxn <- gdf[gdf$ramp_rxn_id %in% commonRxnIds,]
+
+      # one row per reaction, include concat mets, concat proteins
+      # probably a more streamlined way to do this...
+      commonRampRxnList <- c()
+      commonRxnList <- c()
+      mRxnSourceIds <- c()
+      mRxnSourceNames <- c()
+      gRxnSourceIds <- c()
+      gRxnSourceName <- c()
+      gRxnUniprot <- c()
+      commonRxnLabel <- c()
+      commonRxnHTMLEq <- c()
+      hasHumanProtein <- c()
+      onlyHumanMets <- c()
+      isTransport <- c()
+      rxnDir <- c()
+      for(rxn in commonRxnIds) {
+        commonRampRxnList <- c(commonRampRxnList, rxn)
+        commonRxnList <- c(commonRxnList, paste0(unique(mRxn$rxn_source_id[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+        mRxnSourceIds <- c(mRxnSourceIds, paste0(unique(mRxn$sourceId[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+        mRxnSourceNames <- c(mRxnSourceNames, paste0(unique(mRxn$met_name[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+
+        gRxnSourceIds <- c(gRxnSourceIds, paste0(unique(gRxn$sourceId[gRxn$ramp_rxn_id == rxn]), collapse = ','))
+        gRxnUniprot <- c(gRxnUniprot, paste0(unique(gRxn$uniprot[gRxn$ramp_rxn_id == rxn]), collapse = ','))
+        gRxnSourceName <- c(gRxnSourceName, paste0(unique(gRxn$protein_name[gRxn$ramp_rxn_id == rxn]), collapse = ','))
+
+        rxnDir <- c(rxnDir, paste0(unique(mRxn$direction[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+
+        commonRxnLabel <- c(commonRxnLabel, paste0(unique(mRxn$label[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+        commonRxnHTMLEq <- c(commonRxnHTMLEq, paste0(unique(mRxn$html_equation[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+
+        isTransport <- c(isTransport, paste0(unique(mRxn$is_transport[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+        hasHumanProtein <- c(hasHumanProtein, paste0(unique(mRxn$has_human_prot[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+        onlyHumanMets <- c(onlyHumanMets, paste0(unique(mRxn$only_human_mets[mRxn$ramp_rxn_id == rxn]), collapse = ','))
+      }
+      commonReactions <- data.frame(list('metabolites' = mRxnSourceIds, 'met_names' = mRxnSourceNames ,'genes' = gRxnSourceIds,
+                                         'uniprot' = gRxnUniprot, 'proteinNames' = gRxnSourceName,
+                                         'reactionId' = commonRxnList, 'rxn_dir' = rxnDir,'rxn_label' = commonRxnLabel,
+                                         'rxn_html_label' = commonRxnHTMLEq,
+                                         'is_transport' = isTransport,
+                                         'has_human_protein' = hasHumanProtein,
+                                         'only_human_mets' = onlyHumanMets))
+
+      resultList[['metProteinCommonReactions']] <- commonReactions
+    }
+  }
+
+  return(resultList)
+}
+
+#' getReactionsForRaMPCompoundIds returns reactions for a collection of ramp compound ids
+#'
+#' @param rampCompoundIds list of ramp compound ids
+#' @param onlyHumanMets boolean to only return pathways containing only human metabolites (ChEBI ontology) (dev in progress)
+#' @param humanProtein boolean to only control pathways catalyzed by a human proteins (having human Uniprot) (dev in progress)
+#' @param includeTransportRxns if TRUE, returns metabolic and transport reactions
+#'
+#' @return returns a dataframe of reaction information for each ramp compound id
+getReactionsForRaMPCompoundIds <- function(rampCompoundIds, onlyHumanMets=F, humanProtein=F, includeTransportRxns=F, rxnDirs=c("UN")) {
+
+  idStr <- listToQueryString(rampCompoundIds)
+  query <- paste0("select mr.ramp_rxn_id, mr.ramp_cmpd_id, mr.met_source_id, mr.substrate_product, mr.is_cofactor, mr.met_name,
+  mr.ramp_rxn_id, rxn.rxn_source_id, rxn.is_transport, rxn.label, rxn.direction, rxn.equation, rxn.html_equation, rxn.ec_num, rxn.has_human_prot, rxn.only_human_mets
+  from reaction2met mr, reaction rxn
+  where mr.ramp_cmpd_id in (",idStr,") and rxn.ramp_rxn_id = mr.ramp_rxn_id")
+
+  if(length(rxnDirs) == 1) {
+    query <- paste0(query, " and rxn.direction = '",rxnDirs[1],"'")
+  } else if(length(rxnDirs)>1) {
+    query <- paste0(query, " and rxn.direction in (",listToQueryString(rxnDirs),")")
+  } else {
+    print("rxnDirs must be of length > 0")
+  }
+
+  if(humanProtein) {
+    query <- paste0(query," and rxn.has_human_prot = 1")
+  }
+
+  if(onlyHumanMets) {
+    query <- paste0(query, " and rxn.only_human_mets = 1")
+  }
+
+  if(!includeTransportRxns) {
+    query <- paste0(query, " and rxn.is_transport = 0")
+  }
+
+  conn <- connectToRaMP()
+  df <- RMariaDB::dbGetQuery(conn,query)
+  RMariaDB::dbDisconnect(conn=conn)
+  return(df)
+}
+
+
+
+#' getReactionsForRaMPGeneIds returns reactions for a collection of ramp compound ids
+#'
+#' @param rampGeneIds list of ramp compound ids
+#' @param onlyHumanMets boolean to only return pathways containing only human metabolites (ChEBI ontology) (dev in progress)
+#' @param humanProtein boolean to only control pathways catalyzed by a human proteins (having human Uniprot) (dev in progress)
+#' @param includeTransportRxns if TRUE, returns metabolic and transport reactions
+#'
+#' @return returns a dataframe of reaction information for each ramp compound id
+getReactionsForRaMPGeneIds <- function(rampGeneIds, onlyHumanMets=F, humanProtein=F, includeTransportRxns=F, rxnDirs=c("UN")) {
+
+  idStr <- listToQueryString(rampGeneIds)
+  query <- paste0("select gr.ramp_rxn_id, gr.ramp_gene_id, gr.uniprot, gr.protein_name,
+  gr.ramp_rxn_id, rxn.rxn_source_id, rxn.is_transport, rxn.label, rxn.direction, rxn.equation, rxn.html_equation, rxn.ec_num, rxn.has_human_prot, rxn.only_human_mets
+  from reaction2protein gr, reaction rxn
+  where gr.ramp_gene_id in (",idStr,") and rxn.ramp_rxn_id = gr.ramp_rxn_id")
+
+  if(length(rxnDirs) == 1) {
+    query <- paste0(query, " and rxn.direction = '",rxnDirs[1],"'")
+  } else if(length(rxnDirs)>1) {
+    query <- paste0(query, " and rxn.direction in (",listToQueryString(rxnDirs),")")
+  } else {
+    print("rxnDirs must be of length > 0")
+  }
+
+  if(humanProtein) {
+    query <- paste0(query, " and rxn.has_human_prot = 1")
+  }
+
+  if(onlyHumanMets) {
+    query <- paste0(query, " and rxn.only_human_mets = 1")
+  }
+
+  if(!includeTransportRxns) {
+    query <- paste0(query, " and rxn.is_transport = 0")
+  }
+
+  conn <- connectToRaMP()
+  df <- RMariaDB::dbGetQuery(conn,query)
+  RMariaDB::dbDisconnect(conn=conn)
+  return(df)
+}
+
+
+#####################################################
+###################################
+#
+# general dev note... perhaps the functions below should be moved to rampQueryHelpers
+#
+##
+
+
+
+#' getRampSourceInfoFromAnalyteIDs Utility method to extract source table information from analyte ids
+#'
+#' @param analytes list of analyte ids
+#'
+#' @return returns a dataframe of ramp analyte source information
+getRampSourceInfoFromAnalyteIDs <- function(analytes) {
+
+  analyteStr <- listToQueryString(analytes)
+
+  query = paste("select distinct sourceId, rampId, geneOrCompound from source where sourceId in (",analyteStr,")")
+
+  conn <- connectToRaMP()
+  df <- RMariaDB::dbGetQuery(conn,query)
+  RMariaDB::dbDisconnect(conn=conn)
+
+  return(df)
+}
+
+
+#' listToQueryString utility method to convert an id list to a comma separate string, with single quoted values.
+#'
+#' @param analytes list of analytes (can be names or ids)
+#'
+#' @return comma separated list of single quoted analyte ids or names
+listToQueryString <- function(analytes) {
+  analyteStr <- paste0("'", paste0(analytes, collapse = "','"), "'", sep="")
+  return (analyteStr)
+}