Merge pull request #31 from ncats/dev

update default background value for enrichment to database, patch fin…
ncats · Apr 6, 2022 · bf4ff54 · bf4ff54
2 parents 64a2d6c + e36c7f2
commit bf4ff54
Show file tree

Hide file tree

Showing 7 changed files with 84 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,7 @@
 .Rhistory
 .RData
 inst/shinyApp/db.properties.template
-
+dbprops.txt
 # IDE - VSCode
 .vscode/
 # !.vscode/settings.json

diff --git a/R/ReturnPathwaysEnrich_InputAnalytes.R b/R/ReturnPathwaysEnrich_InputAnalytes.R
@@ -9,14 +9,14 @@
 #' @param min_path_size the minimum number of pathway members (genes and metabolites) to include the pathway in the output (default = 5)
 #' @param max_path_size the maximum number of pathway members (genes and metabolites) to include the pathway in the output (default = 150)
 #' @param background_type type of background that is input by the user.  Options are "database" if user wants all
-#' analytes from the RaMP database will be used; "file", if user wnats to input a file with a list of background
+#' analytes from the RaMP database to be used; "file", if user wants to input a file with a list of background
 #' analytes; "list", if user wants to input a vector of analyte IDs; "biospecimen", if user wants to specify a
-#' biospecimen type (e.g. blood, adipose, etc.) and have those biospecimen-specific analytes used.  For genes,
+#' biospecimen type (e.g. blood, adipose tissue, etc.) and have those biospecimen-specific analytes used.  For genes,
 #' only the "database" option is used.
 #' @param background background to be used for Fisher's tests.  If parameter 'background_type="database"', this parameter
-#' is ignored (default=NULL); if parameter 'background_type= "file"', then 'background' should be a file name (with
+#' is ignored (default="database"); if parameter 'background_type= "file"', then 'background' should be a file name (with
 #' directory); if 'background_type="list"', then 'background' should be a vector of RaMP IDs; if 'background_type="biospecimen"'
-#' then users should specify one of the following: "Blood", "Adipose", "Heart", "Urine", "Brain", "Liver", "Kidney",
+#' then users should specify one of the following: "Blood", "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney",
 #' "Saliva", and "Feces"
 #' @return a dataframe with columns containing pathway ID, fisher's p value, user analytes in pathway, and total analytes in pathway
 
@@ -25,7 +25,7 @@ runFisherTest <- function(analytes,
                           NameOrIds = "ids",
                           analyte_type = "metabolites",
                           MCall = F, alternative = "less", min_path_size=5, max_path_size=150,
-                          background_type="database", background="NULL") {
+                          background_type="database", background="database") {
 
   now <- proc.time()
   print("Fisher Testing ......")
@@ -462,16 +462,18 @@ runFisherTest <- function(analytes,
 #' @param max_path_size the maximum number of pathway members (genes and metabolites) to include the pathway in the output (default = 150)
 #' @param includeRaMPids include internal RaMP identifiers (default is "FALSE")
 #' @param background_type type of background that is input by the user.  Options are "database" if user wants all
-#' analytes from the RaMP database will be used; "file", if user wnats to input a file with a list of background
+#' analytes from the RaMP database to be used as background; "file", if user wants to input a file path with a list of background
 #' analytes; "list", if user wants to input a vector of analyte IDs; "biospecimen", if user wants to specify a
-#' biospecimen type (e.g. blood, adipose, etc.) and have those biospecimen-specific analytes used.  For genes,
+#' biospecimen type (e.g. blood, adipose tissue, etc.) and have those biospecimen-specific analytes used.  For genes,
 #' only the "database" option is used.
 #' @param background background to be used for Fisher's tests.  If parameter 'background_type="database"', this parameter
-#' is ignored (default=NULL); if parameter 'background_type= "file"', then 'background' should be a file name (with
+#' is ignored (default="database"); if parameter 'background_type= "file"', then 'background' should be a file name (with
 #' directory); if 'background_type="list"', then 'background' should be a vector of RaMP IDs; if 'background_type="biospecimen"'
-#' then users should specify one of the following: "Blood", "Adipose", "Heart", "Urine", "Brain", "Liver", "Kidney",
+#' then users should specify one of the following: "Blood", "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney",
 #' "Saliva", and "Feces"
-#' @return a list containing two entries: [[1]] fishresults, a dataframe containing pathways with Fisher's p values (raw and with FDR and Holm adjustment), number of user analytes in pathway, total number of analytes in pathway, and pathway source ID/database. [[2]] analyte_type, a string specifying the type of analyte input into the function ("genes", "metabolites", or "both")
+#' @return a list containing two entries: [[1]] fishresults, a dataframe containing pathways with Fisher's p values
+#' (raw and with FDR and Holm adjustment), number of user analytes in pathway, total number of analytes in pathway,
+#' and pathway source ID/database. [[2]] analyte_type, a string specifying the type of analyte input into the function ("genes", "metabolites", or "both")
 #' @examples
 #' \dontrun{
 #' pkg.globals <- setConnectionToRaMP(
@@ -494,7 +496,7 @@ runCombinedFisherTest <- function(analytes,
                                   max_path_size = 150,
                                   includeRaMPids = FALSE,
 				  background_type = "database",
-				  background = NULL) {
+				  background = "database") {
 
   G <- M <- 0
 
@@ -520,24 +522,24 @@ runCombinedFisherTest <- function(analytes,
   }
 
   # Grab pathways that contain genes to run Fisher on genes
-    ## fishgene <- pathwaydf[grep("RAMP_G_", pathwaydf$rampId), ]
-    ## Genes are not evaluated if custom background is specified
-    if(background_type == "database"){
-        print("Running Fisher's tests on genes")
-        outgene <- runFisherTest(
-            analytes = analytes,
-            analyte_type = "genes",
-            total_genes = total_genes,
-            MCall = MCall,
-            min_path_size = min_path_size,
-            max_path_size = max_path_size
-        )
-        pathwaydf_gene <- outgene[[2]]
-        outgene <- outgene[[1]]
-    }else{
-        outgene <- NULL
-        pathwaydf_gene <- NULL
-    }
+  ## fishgene <- pathwaydf[grep("RAMP_G_", pathwaydf$rampId), ]
+  ## Genes are not evaluated if custom background is specified
+  if(background_type == "database"){
+    print("Running Fisher's tests on genes")
+    outgene <- runFisherTest(
+      analytes = analytes,
+      analyte_type = "genes",
+      total_genes = total_genes,
+      MCall = MCall,
+      min_path_size = min_path_size,
+      max_path_size = max_path_size
+    )
+    pathwaydf_gene <- outgene[[2]]
+    outgene <- outgene[[1]]
+  }else{
+    outgene <- NULL
+    pathwaydf_gene <- NULL
+  }
 
   # if no ids map to pathways, return an empty result.
   if((is.null(pathwaydf_metab) || nrow(pathwaydf_metab) < 1) &&
@@ -568,7 +570,7 @@ runCombinedFisherTest <- function(analytes,
       out[keepers, ],
       by = "pathwayRampId"
     )
-  } else if (!is.null(outgene) & is.null(outmetab)) {
+  } else if (!is.null(outgene) && is.null(outmetab)) {
     out <- outgene
     fdr <- stats::p.adjust(out$Pval, method = "fdr")
     out <- cbind(out, fdr)
@@ -821,11 +823,20 @@ getPathwayFromAnalyte <- function(analytes = "none",
 
 findCluster <- function(fishers_df, perc_analyte_overlap = 0.5,
                         min_pathway_tocluster = 2, perc_pathway_overlap = 0.5) {
+
   print("Clustering pathways...")
+
   if (perc_analyte_overlap <= 0 || perc_analyte_overlap >= 1 ||
     perc_pathway_overlap <= 0 || perc_pathway_overlap >= 1) {
-    return(NULL)
+    warning("No Clustering. perc_analyte_overlap and percent_pathway_overlap must bee in the range of (0,1), exclusive (not exactly 0 or 1).")
+    return(fishers_df)
   }
+
+  if(is.null(fishers_df$fishresults) || nrow(fishers_df$fishresults) < 1) {
+    warning("The contained input pathway dataframe is empty (fishers_df$fishresults). Returning input result without clustering.")
+    return(fishers_df)
+  }
+
   analyte_type <- fishers_df$analyte_type
   fishers_df <- fishers_df$fishresults
   list_pathways <- fishers_df %>% dplyr::pull("pathwayId")

diff --git a/R/rampChemClassQueries.R b/R/rampChemClassQueries.R
@@ -4,11 +4,12 @@
 #'
 #' @param mets a list object of source prepended metabolite ids, representing a metabolite set of interest
 #' @param background an optional list of source prepended metabolite ids to be used as the background reference of
-#' metabolites for enrichment. The background can be either a list of ids or can be a file name containing the id list,
-#' one id per column, no file header rows.
+#' metabolites for enrichment. The background can be either a list of ids, a file name containing the id list,
+#' one id per column (no file header row) or a specified biospecimen type (available biospecimen types: "Blood",
+#' "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney","Saliva", or "Feces").
 #' @param background_type one of 'database' (all analytes in the RaMP Database), 'list' (a list of input ids),
-#' or 'file' in which case the background parameter will be a file name, or 'biospecimin' where the specified background parameter is
-#' a RaMP HMDB metabolite ontology term (use RaMP::getOntologies() to see a list of available ontology terms).
+#' or 'file' in which case the background parameter will be a file path, or 'biospecimen' where the specified background parameter is
+#' a RaMP HMDB metabolite ontology term (see background parameter, above, for the most common biospecimen background values).
 #' @param includeRaMPids include internal RaMP identifiers (default is "FALSE")
 #' @return Returns chemical class information data including class count tallies and comparisons between metabolites of interest and the metabolite population,
 #' metabolite mappings to classes, and query summary report indicating the number of input metabolites that were resolved and listing those metabolite ids
@@ -65,7 +66,7 @@
 #' metClassResult$query_report
 #'}
 #' @export
-chemicalClassSurvey <- function(mets, background = "NULL", background_type="database", includeRaMPids = FALSE){
+chemicalClassSurvey <- function(mets, background = "database", background_type="database", includeRaMPids = FALSE){
   conn <- connectToRaMP()
   print("Starting Chemical Class Survey")
 
@@ -183,11 +184,12 @@ chemicalClassSurvey <- function(mets, background = "NULL", background_type="data
 #'
 #' @param mets a vector of source prepended metabolite ids
 #' @param background an optional list of source prepended metabolite ids to be used as the background reference of
-#' metabolites for enrichment. The background can be either a list of ids or can be a file name containing the id list,
-#' one id per column, no file header rows.
+#' metabolites for enrichment. The background can be either a list of ids, a file name containing the id list,
+#' one id per column (no file header row) or a specified biospecimen type (available biospecimen types: "Blood",
+#' "Adipose tissue", "Heart", "Urine", "Brain", "Liver", "Kidney","Saliva", or "Feces").
 #' @param background_type one of 'database' (all analytes in the RaMP Database), 'list' (a list of input ids),
-#' or 'file' in which case the background parameter will be a file name, or 'biospecimin' where the specified background parameter is
-#' a RaMP HMDB metabolite ontology term (use RaMP::getOntologies() to see a list of available ontology terms).
+#' or 'file' in which case the background parameter will be a file path, or 'biospecimen' where the specified background parameter is
+#' a RaMP HMDB metabolite ontology term (see background parameter, above, for the most common biospecimen background values).
 #' @return a list of dataframes, each holding chemical class enrichment statistics for specific chemical classification systems,
 #' such as HMDB Classyfire class categories and LIPIDMAPS class categories.  The results list chemical classes, metabolite hits counts,
 #' Fisher Exact p-values and Benjamini-Hochberg corrected p-values (FDR estimates)
@@ -216,7 +218,7 @@ chemicalClassSurvey <- function(mets, background = "NULL", background_type="data
 #' enrichedClassStats <- chemicalClassEnrichment(mets = metList)
 #'}
 #' @export
-chemicalClassEnrichment <- function(mets, background = "NULL", background_type = "list") {
+chemicalClassEnrichment <- function(mets, background = "database", background_type = "database") {
   print("Starting Chemical Class Enrichment")
 
   classData <- chemicalClassSurvey(mets = mets,

diff --git a/man/chemicalClassEnrichment.Rd b/man/chemicalClassEnrichment.Rd
diff --git a/man/chemicalClassSurvey.Rd b/man/chemicalClassSurvey.Rd
diff --git a/man/runCombinedFisherTest.Rd b/man/runCombinedFisherTest.Rd
diff --git a/man/runFisherTest.Rd b/man/runFisherTest.Rd