signature updates and cancer gene rank function

sigven · Sep 8, 2024 · 631209d · 631209d
1 parent 83244b0
commit 631209d
Show file tree

Hide file tree

Showing 18 changed files with 2,777 additions and 602 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -58,7 +58,7 @@ VignetteBuilder: knitr
 License: MIT + file LICENSE
 Encoding: UTF-8
 Remotes: jespermaag/gganatogram
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Roxygen: list(markdown = TRUE)
 Config/testthat/edition: 3
 LazyData: true

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(cancer_association_rank)
 export(load_db)
 export(onco_enrich)
 export(write)

diff --git a/R/disease_drug.R b/R/disease_drug.R
@@ -1,4 +1,199 @@
+#' Rank a gene list according to cancer relevance to a tumor type/site
+#'
+#' Function that ranks a list of human gene identifiers with respect to
+#' strength of association to a particular tumor type/site. Underlying association
+#' evidence for the cancer rank per site is pulled from the Open Targets Platform,
+#' see more details \href{https://sigven.github.io/oncoEnrichR/articles/cancer_gene_rank.html}{here}.
+#'
+#' @param query character vector with gene/query identifiers (minimum 1, maximum 2,500)
+#' @param tumor_site character indicating primary tumor site of interest
+#' @param query_id_type character indicating source of query (one of
+#' "uniprot_acc", "symbol", "entrezgene", "ensembl_gene", "ensembl_mrna",
+#' "refseq_transcript_id", "ensembl_protein", or "refseq_protein")
+#' @param ignore_id_err logical indicating if analysis should
+#' continue when unknown query identifiers are encountered
+#' @param include_gene_summary logical indicating if gene summary (NCBI/UniProt)
+#' should be included in output data tables
+#' @param oeDB oncoEnrichR data repository object - as returned from `load_db()`
+#'
+#' @return
+#' A list with two data frames: `ranked` (query genes ranked according to
+#' cancer relevance for the specified tumor site) and `unranked` (query genes
+#' with no evidence found). Returns `NULL` (after logging the reason) if any
+#' of the arguments fail validation.
+#'
+#' @export
+#'
+cancer_association_rank <- function(
+    query = NULL,
+    tumor_site = "Breast",
+    query_id_type = "symbol",
+    ignore_id_err = TRUE,
+    include_gene_summary = FALSE,
+    oeDB = NULL){
 
+  lgr::lgr$appenders$console$set_layout(
+    lgr::LayoutFormat$new(timestamp_fmt = "%Y-%m-%d %T"))
+
+  if (is.null(oeDB)) {
+    lgr::lgr$info( paste0(
+      "ERROR: mandatory argument 'oeDB' cannot be NULL"))
+    return()
+  }
+  oedb_val <- validate_db(oeDB)
+  if (oedb_val != 0) {
+    return()
+  }
+
+  val <- assertthat::validate_that(
+    query_id_type %in%
+      c("symbol", "entrezgene",
+        "refseq_transcript_id", "ensembl_mrna",
+        "refseq_protein", "ensembl_protein",
+        "uniprot_acc",
+        "ensembl_gene")
+  )
+  if (!is.logical(val)) {
+    lgr::lgr$info( paste0(
+      "ERROR: 'query_id_type' must take on one of the following values: ",
+      "'symbol', 'entrezgene', 'refseq_transcript_id', 'ensembl_mrna', ",
+      "'refseq_protein', 'ensembl_protein', 'uniprot_acc', 'ensembl_gene'",
+      " (value provided was '", query_id_type,"')"))
+    return()
+  }
+
+  tumor_sites <-
+    c('Adrenal Gland', 'Ampulla of Vater', 'Biliary Tract',
+    'Bladder/Urinary Tract', 'Bone', 'Breast', 'CNS/Brain',
+    'Cervix', 'Colon/Rectum', 'Esophagus/Stomach',
+    'Eye', 'Head and Neck', 'Kidney', 'Liver',
+    'Lung', 'Lymphoid', 'Myeloid',
+    'Ovary/Fallopian Tube', 'Pancreas',
+    'Penis', 'Peripheral Nervous System',
+    'Peritoneum', 'Pleura', 'Prostate',
+    'Skin', 'Soft Tissue', 'Testis',
+    'Thymus', 'Thyroid',
+    'Uterus', 'Vulva/Vagina')
+
+  ## a non-scalar 'tumor_site' (NULL, length > 1) cannot be used as an
+  ## `if` condition, so treat it as an invalid value as well
+  if(length(tumor_site) != 1 || !(tumor_site %in% tumor_sites)){
+    lgr::lgr$info( paste0(
+      "ERROR: argument 'tumor_site' must have a value in the following list: '",
+      paste(tumor_sites, collapse="', '"),"'"))
+    return()
+  }
+
+  if (is.null(query)) {
+    lgr::lgr$info( paste0(
+      "ERROR: mandatory argument 'query' cannot be NULL"))
+    return()
+  }
+  if (!is.character(query)) {
+    lgr::lgr$info( paste0(
+      "ERROR: mandatory argument 'query' is of wrong type (not character)"))
+    return()
+  }
+
+  if (length(query) == 0) {
+    lgr::lgr$info( paste0(
+      "ERROR: mandatory argument 'query' is empty (length = 0)"))
+    return()
+  }
+
+  ## validate query gene set
+  qgenes_match <-
+    validate_query_genes(
+      qgenes = query,
+      q_id_type = query_id_type,
+      ignore_id_err = ignore_id_err,
+      genedb = oeDB[['genedb']][['all']],
+      transcript_xref = oeDB[['genedb']][['transcript_xref']])
+
+  val <- assertthat::validate_that(NROW(qgenes_match$found) >= 1)
+  if (!is.logical(val)) {
+    lgr::lgr$info( paste0(
+      "ERROR: query set must contain at least one valid entry - ",
+      "number of validated entries: ",
+      NROW(qgenes_match$found)))
+    return()
+  }
+
+  if(NROW(qgenes_match$found) > 2500){
+    lgr::lgr$warn( paste0(
+      "Query set exceeds max limit of 2,500 valid entries - ",
+      "limiting input to 2,500 entries"))
+    qgenes_match[['found']] <- head(qgenes_match[['found']], 2500)
+  }
+
+  lgr::lgr$info( paste0(
+    "Generating rank of query set according to cancer association ",
+    "to '", tumor_site,"'"))
+
+  ## get validated gene set
+  targets_validated <-
+    qgenes_match[['found']] |>
+    dplyr::select(
+      c("symbol","entrezgene")) |>
+    dplyr::left_join(
+      dplyr::select(
+        oeDB$genedb$all,
+        c("ensembl_gene_id", "entrezgene",
+          "name", "gene_biotype","gene_summary")),
+      by = "entrezgene"
+    )
+
+  ## gene summaries are only cleaned when they are kept in the output
+  if(include_gene_summary){
+    targets_validated$gene_summary <-
+      stringr::str_trim(
+        textclean::replace_html(targets_validated$gene_summary),
+        side = "both")
+  } else {
+    targets_validated <-
+      dplyr::select(targets_validated, -c("gene_summary"))
+  }
+
+  site_rank <- oeDB$otdb$gene_rank |>
+    dplyr::filter(.data$primary_site == tumor_site)
+
+  ## get cancer gene rank
+  cancer_rank <-
+    targets_validated |>
+    dplyr::left_join(site_rank,
+                     by = "ensembl_gene_id") |>
+    dplyr::arrange(dplyr::desc(.data$tissue_assoc_rank)) |>
+    dplyr::rename(
+      tumor_site = "primary_site",
+      site_assoc_rank = "tissue_assoc_rank",
+      site_assoc_score = "tissue_assoc_score"
+    )
+
+  ## genes with unknown rank (missing site association rank after the join)
+  unranked <-
+    cancer_rank |>
+    dplyr::filter(is.na(.data$site_assoc_rank))
+  n_genes_no_site_associations <- NROW(unranked)
+
+  result <- list()
+  result[['ranked']] <- cancer_rank |>
+    dplyr::filter(!is.na(.data$site_assoc_rank))
+  result[['unranked']] <- data.frame()
+
+  if(n_genes_no_site_associations > 0){
+    result[['unranked']] <- unranked
+
+    pct_unknown_assoc <-
+      round((n_genes_no_site_associations /
+              NROW(cancer_rank) * 100), 2)
+    lgr::lgr$warn( paste0(
+      pct_unknown_assoc, "% of the ",
+      "query set have no/non-significant cancer association ",
+      "with '", tumor_site,"'"))
+  }
+
+  return(result)
+}
 
 target_disease_associations <-
   function(qgenes,

diff --git a/R/enrich.R b/R/enrich.R
@@ -75,6 +75,9 @@ get_go_enrichment <- function(query_entrez,
           go_description = "Description",
           count = "Count",
           gene_ratio = "GeneRatio",
+          rich_factor = "RichFactor",
+          fold_enrichment = "FoldEnrichment",
+          z_score = "zScore",
           background_ratio = "BgRatio",
           gene_id = "geneID") |>
         dplyr::mutate(
@@ -242,6 +245,9 @@ get_universal_enrichment <- function(query_entrez,
           description = "Description",
           count = "Count",
           gene_ratio = "GeneRatio",
+          rich_factor = "RichFactor",
+          fold_enrichment = "FoldEnrichment",
+          z_score = "zScore",
           background_ratio = "BgRatio",
           gene_id = "geneID")
     )
@@ -257,6 +263,7 @@ get_universal_enrichment <- function(query_entrez,
         dplyr::mutate(
           exact_source = "https://wikipathways.org",
           external_url = "https://wikipathways.org",
+          url = "https://wikipathways.org",
           db = dbsource)
     }
     else if (dbsource == "KEGG") {
@@ -271,6 +278,7 @@ get_universal_enrichment <- function(query_entrez,
         dplyr::mutate(
           exact_source = "https://www.genome.jp/kegg/pathway.html",
           external_url = "https://www.genome.jp/kegg/pathway.html",
+          url = "https://www.genome.jp/kegg/pathway.html",
           db = dbsource)
     }
     else if (dbsource == "NetPath") {
@@ -284,18 +292,21 @@ get_universal_enrichment <- function(query_entrez,
         dplyr::mutate(
           exact_source = "http://netpath.org",
           external_url = "http://netpath.org",
+          url = "http://netpath.org",
           db = dbsource)
     }
     else{
       stopifnot(!is.null(TERM2SOURCE) | !is.data.frame(TERM2SOURCE))
       stopifnot("standard_name" %in% colnames(TERM2SOURCE))
       df <- df |>
-        dplyr::left_join(TERM2SOURCE, by="standard_name", relationship = "many-to-many") |>
+        dplyr::left_join(
+          TERM2SOURCE, by="standard_name",
+          relationship = "many-to-many") |>
         dplyr::mutate(
           description_link =
             dplyr::if_else(
-              !is.na(.data$external_url),
-              paste0("<a href='",.data$external_url,
+              !is.na(.data$url),
+              paste0("<a href='",.data$url,
                      "' target='_blank'>",
                      .data$description,"</a>"),
               .data$description))

diff --git a/R/onco_enrichr.R b/R/onco_enrichr.R
@@ -918,20 +918,6 @@ onco_enrich <- function(query = NULL,
     return()
   }
 
-  # val <- assertthat::validate_that(
-  #   html_report_theme %in% c("bootstrap","cerulean","cosmo","default",
-  #                       "flatly","journal","lumen","paper","sandstone",
-  #                       "simplex","spacelab","united","yeti")
-  # )
-  # if (!is.logical(val)) {
-  #   lgr::lgr$info( paste0(
-  #     "ERROR: 'html_report_theme' must take on any of the following values: ",
-  #     "'bootstrap', 'cerulean', 'cosmo', 'default', 'flatly', 'journal', 'lumen',",
-  #     "'paper', 'sandstone', 'simplex', 'spacelab', 'united', 'yeti'",
-  #     " (value provided was '", enrichment_p_value_adj,"')"))
-  #   return()
-  # }
-
   ## Number of allowed query genes
   oncoenrichr_query_limit <- 1000
 
@@ -1510,7 +1496,7 @@ onco_enrich <- function(query = NULL,
       onc_rep[["config"]][["fitness"]][["plot_height_fitness"]]  <-
         onc_rep[["config"]][["fitness"]][["plot_height_fitness"]] +
         as.integer((
-          onc_rep[["data"]][["fitness"]][["fitness_scores"]][["n_targets"]] - 20)/8.5)
+          onc_rep[["data"]][["fitness"]][["fitness_scores"]][["n_targets"]] - 20)/5.6)
     }
 
     onc_rep[["data"]][["fitness"]][["target_priority_scores"]] <-

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/data_processing_code/RELEASE_NOTES.txt b/data_processing_code/RELEASE_NOTES.txt
@@ -1,6 +1,6 @@
-##ONCOENRICHR_DB_VERSION = 20240803
+##ONCOENRICHR_DB_VERSION = 20240906
 name	url	description	version	key	resource_type	license
-oncoEnrichR	https://github.com/sigven/oncoEnrichR	R package for functional interrogation of genesets in the context of cancer	v1.5.1	oncoEnrichR	software	MIT
+oncoEnrichR	https://github.com/sigven/oncoEnrichR	R package for functional interrogation of genesets in the context of cancer	v1.5.2	oncoEnrichR	software	MIT
 clusterProfiler	https://guangchuangyu.github.io/software/clusterProfiler/	A universal enrichment tool for interpreting omics data (R package)	v4.12.0	clusterProfiler	software	Artistic-2.0
 OmnipathR	https://omnipathdb.org/	Access to datasets on prior molecular knowledge: gene regulatory interactions, enzyme-PTM relationships, protein complexes, protein annotations etc.	v3.11.10	omnipathr	software	MIT
 hu.MAP	http://humap2.proteincomplexes.org/	Human Protein Complex Map	v2.0	humap2	db	CC0 1.0
@@ -12,20 +12,20 @@ phenOncoX	https://github.com/sigven/phenOncoX	Crossmapped phenotype ontologies f
 STRING	https://string-db.org	Protein-protein interaction database	v12.0	string	db	CC-BY 4.0
 BIOGRID	https://thebiogrid.org/	Database of Protein, Genetic and Chemical Interactions	v4.4.236	biogrid	db	MIT
 GENCODE	https://www.gencodegenes.org/	High quality reference gene annotation and experimental validation	v46	gencode	db	Open Access
-The Cancer Genome Atlas (TCGA)	https://cancergenome.nih.gov	Tumor gene expression and somatic DNA aberrations across a large cohort of tumor samples	v39.0 (December 4th 2023)	tcga	db	Open Access
-UniProt	http://www.uniprot.org	Comprehensive resource of protein sequence and functional information	v2024_03	uniprot	db	CC-BY 4.0
+The Cancer Genome Atlas (TCGA)	https://cancergenome.nih.gov	Tumor gene expression and somatic DNA aberrations across a large cohort of tumor samples	v41.0 (August 28th 2024)	tcga	db	Open Access
+UniProt	http://www.uniprot.org	Comprehensive resource of protein sequence and functional information	v2024_04	uniprot	db	CC-BY 4.0
 NetPath	http://www.netpath.org	Manually curated resource of signal transduction pathways in humans	v1 (2010)	netpath	db	Open Access
-EFO	https://github.com/EBISPOT/efo	Experimental Factor Ontology	v3.67.0	efo	db	Apache 2.0
-DiseaseOntology	https://github.com/DiseaseOntology	Human Disease Ontology	2024-05-29	do	db	CC0 1.0
-COMPARTMENTS	https://compartments.jensenlab.org/Search	Subcellular localization database	February 2024	compartments	db	CC-BY 4.0
+EFO	https://github.com/EBISPOT/efo	Experimental Factor Ontology	v3.69.0	efo	db	Apache 2.0
+DiseaseOntology	https://github.com/DiseaseOntology	Human Disease Ontology	v2024-08-28	do	db	CC0 1.0
+COMPARTMENTS	https://compartments.jensenlab.org/Search	Subcellular localization database	August 2024	compartments	db	CC-BY 4.0
 DepMap/ProjectScore	https://depmap.org	Integrated cancer dependency dataset from Wellcome Sanger Institute and Broad Institute, Pacini et al., Nat Commun., 2021	Sanger_V1/Broad_20Q2	depmap	db	CC-BY 4.0
-WikiPathways	https://www.wikipathways.org	A database of biological pathways maintained by and for the scientific community	20240710	wikipathway	db	CC0 1.0
-MSigDB	http://software.broadinstitute.org/gsea/msigdb/index.jsp	Molecular Signatures Database - collection of annotated gene sets	March 2023 (MSigDB v2023.1.Hs)	msigdb	db	CC-BY 4.0
-REACTOME	https://reactome.org	Manually curated and peer-reviewed pathway database	v83 (MSigDB v2023.1.Hs)	reactome	db	CC-BY 4.0
+WikiPathways	https://www.wikipathways.org	A database of biological pathways maintained by and for the scientific community	20240810	wikipathway	db	CC0 1.0
+MSigDB	http://software.broadinstitute.org/gsea/msigdb/index.jsp	Molecular Signatures Database - collection of annotated gene sets	August 2024 (MSigDB v2024.1.Hs)	msigdb	db	CC-BY 4.0
+REACTOME	https://reactome.org	Manually curated and peer-reviewed pathway database	v89 (MSigDB v2024.1.Hs)	reactome	db	CC-BY 4.0
 CellChatDB	http://www.cellchat.org/cellchatdb/	Multimeric ligand-receptor complexes	v1 (2021)	cellchatdb	db	GPL v3.0
 CellTalkDB	http://tcm.zju.edu.cn/celltalkdb/	A manually curated database of literature-supported ligand-receptor interactions in human and mouse	Nov 2020	celltalkdb	db	GPL v3.0
-GeneOntology	https://geneontology.org	Knowledgebase that contains the largest structural source of information on the functions of genes	March 2023 (MSigDB v2023.1.Hs)	go	db	CC-BY 4.0
-KEGG	https://www.genome.jp/kegg/pathway.html	Collection of manually drawn pathway maps representing our knowledge on the molecular interaction, reaction and relation networks	20240626	kegg	db	.
+GeneOntology	https://geneontology.org	Knowledgebase that contains the largest structural source of information on the functions of genes	May 2024 (MSigDB v2024.1.Hs)	go	db	CC-BY 4.0
+KEGG	https://www.genome.jp/kegg/pathway.html	Collection of manually drawn pathway maps representing our knowledge on the molecular interaction, reaction and relation networks	20240904	kegg	db	.
 IntOGen	https://www.intogen.org/search	A compendium of mutational cancer driver genes	2023-05-31	intogen	db	CC0 1.0
 CancerMine	http://bionlp.bcgsc.ca/cancermine/	Literature-mined database of tumor suppressor genes/proto-oncogenes	v50 - 20230301	cancermine	db	CC0 1.0
 Network of cancer genes (NCG)	http://ncg.kcl.ac.uk/index.php	A web resource to analyze duplicability, orthology and network properties of cancer genes	v7.1	ncg	db	Open Access

diff --git a/data_processing_code/data_raw.R b/data_processing_code/data_raw.R
@@ -2,11 +2,11 @@ library(gganatogram)
 
 source('data_processing_code/data_utility_functions.R')
 
-msigdb_version <- 'v2023.1.Hs'
-wikipathways_version <- "20240710"
+msigdb_version <- 'v2024.1.Hs'
+wikipathways_version <- "20240810"
 netpath_version <- "2010"
 opentargets_version <- "2024.06"
-kegg_version <- "20240626"
+kegg_version <- "20240904"
 gencode_version <- "46"
 uniprot_release <- "2024_04"
 
@@ -21,7 +21,7 @@ db_updates[['omnipath_regulatory']] <- F
 db_updates[['subcelldb']] <- F
 db_updates[['ligand_receptor_db']] <- F
 
-oe_version <- "1.5.1"
+oe_version <- "1.5.2"
 
 data_raw_dir <-
   "/Users/sigven/project_data/packages/package__oncoEnrichR/db/raw"