Skip to content

Commit

Permalink
signature updates and cancer gene rank function
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Sep 8, 2024
1 parent 83244b0 commit 631209d
Show file tree
Hide file tree
Showing 18 changed files with 2,777 additions and 602 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ VignetteBuilder: knitr
License: MIT + file LICENSE
Encoding: UTF-8
Remotes: jespermaag/gganatogram
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE)
Config/testthat/edition: 3
LazyData: true
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(cancer_association_rank)
export(load_db)
export(onco_enrich)
export(write)
Expand Down
195 changes: 195 additions & 0 deletions R/disease_drug.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,199 @@
#' Rank a gene list according to cancer relevance to a tumor type/site
#'
#' Function that ranks a list of human gene identifiers with respect to
#' strength of association to a particular tumor type/site. Underlying
#' association evidence for the cancer rank per site is pulled from the
#' Open Targets Platform, see more details
#' \href{https://sigven.github.io/oncoEnrichR/articles/cancer_gene_rank.html}{here}.
#'
#' Invalid arguments are not raised as R errors: the problem is written to
#' the `lgr` logger and the function returns `NULL`.
#'
#' @param query character vector with gene/query identifiers (minimum 1,
#' maximum 2,500). If more than 2,500 identifiers are validated, only the
#' first 2,500 are kept (a warning is logged).
#' @param tumor_site character (single value) indicating primary tumor site
#' of interest, one of "Adrenal Gland", "Ampulla of Vater", "Biliary Tract",
#' "Bladder/Urinary Tract", "Bone", "Breast", "CNS/Brain", "Cervix",
#' "Colon/Rectum", "Esophagus/Stomach", "Eye", "Head and Neck", "Kidney",
#' "Liver", "Lung", "Lymphoid", "Myeloid", "Ovary/Fallopian Tube",
#' "Pancreas", "Penis", "Peripheral Nervous System", "Peritoneum", "Pleura",
#' "Prostate", "Skin", "Soft Tissue", "Testis", "Thymus", "Thyroid",
#' "Uterus", "Vulva/Vagina"
#' @param query_id_type character (single value) indicating source of query
#' (one of "uniprot_acc", "symbol", "entrezgene", "ensembl_gene",
#' "ensembl_mrna", "refseq_transcript_id", "ensembl_protein",
#' "refseq_protein")
#' @param ignore_id_err logical indicating if analysis should
#' continue when unknown query identifiers are encountered
#' @param include_gene_summary logical indicating if gene summary
#' (NCBI/UniProt) should be included in output data tables
#' @param oeDB oncoEnrichR data repository object - as returned from
#' `load_db()`
#'
#' @return
#' A list with two data frames: `ranked` (query genes ranked according to
#' cancer relevance for the specified tumor site, strongest rank first) and
#' `unranked` (query genes with no evidence found; an empty data frame if
#' all genes are ranked). `NULL` is returned if argument validation fails.
#'
#' @export
#'
cancer_association_rank <- function(
    query = NULL,
    tumor_site = "Breast",
    query_id_type = "symbol",
    ignore_id_err = TRUE,
    include_gene_summary = FALSE,
    oeDB = NULL) {

  lgr::lgr$appenders$console$set_layout(
    lgr::LayoutFormat$new(timestamp_fmt = "%Y-%m-%d %T"))

  ## ---- argument validation (log + return NULL on failure) ----
  if (is.null(oeDB)) {
    lgr::lgr$info(paste0(
      "ERROR: mandatory argument 'oeDB' cannot be NULL"))
    return(NULL)
  }
  oedb_val <- validate_db(oeDB)
  if (oedb_val != 0) {
    return(NULL)
  }

  valid_id_types <-
    c("symbol", "entrezgene",
      "refseq_transcript_id", "ensembl_mrna",
      "refseq_protein", "ensembl_protein",
      "uniprot_acc",
      "ensembl_gene")

  ## scalar guard first: NULL / multi-valued input must not reach '%in%'
  ## inside 'if', where it would raise an R error instead of being logged
  id_type_ok <-
    is.character(query_id_type) &&
    length(query_id_type) == 1 &&
    query_id_type %in% valid_id_types
  if (!id_type_ok) {
    lgr::lgr$info(paste0(
      "ERROR: 'query_id_type' must take on of the following values: ",
      "'symbol', 'entrezgene', 'refseq_transcript_id', 'ensembl_mrna', ",
      "'refseq_protein', 'ensembl_protein', 'uniprot_acc', 'ensembl_gene'",
      " (value provided was '",
      paste(query_id_type, collapse = ", "), "')"))
    return(NULL)
  }

  tumor_sites <-
    c("Adrenal Gland", "Ampulla of Vater", "Biliary Tract",
      "Bladder/Urinary Tract", "Bone", "Breast", "CNS/Brain",
      "Cervix", "Colon/Rectum", "Esophagus/Stomach",
      "Eye", "Head and Neck", "Kidney", "Liver",
      "Lung", "Lymphoid", "Myeloid",
      "Ovary/Fallopian Tube", "Pancreas",
      "Penis", "Peripheral Nervous System",
      "Peritoneum", "Pleura", "Prostate",
      "Skin", "Soft Tissue", "Testis",
      "Thymus", "Thyroid",
      "Uterus", "Vulva/Vagina")

  ## 'tumor_site' is later compared with '==' in a filter, so it has to be
  ## exactly one known site
  tumor_site_ok <-
    is.character(tumor_site) &&
    length(tumor_site) == 1 &&
    tumor_site %in% tumor_sites
  if (!tumor_site_ok) {
    lgr::lgr$info(paste0(
      "ERROR: argument 'tumor_site' must have a value in the following list: '",
      paste(tumor_sites, collapse = "', '"), "'"))
    return(NULL)
  }

  if (is.null(query)) {
    lgr::lgr$info(paste0(
      "ERROR: mandatory argument 'query' cannot be NULL"))
    return(NULL)
  }
  if (!is.character(query)) {
    lgr::lgr$info(paste0(
      "ERROR: mandatory argument 'query' is of wrong type (not character)"))
    return(NULL)
  }
  if (length(query) == 0) {
    lgr::lgr$info(paste0(
      "ERROR: mandatory argument 'query' is empty (length = 0)"))
    return(NULL)
  }

  ## ---- validate query gene set ----
  qgenes_match <-
    validate_query_genes(
      qgenes = query,
      q_id_type = query_id_type,
      ignore_id_err = ignore_id_err,
      genedb = oeDB[["genedb"]][["all"]],
      transcript_xref = oeDB[["genedb"]][["transcript_xref"]])

  n_found <- NROW(qgenes_match[["found"]])
  if (n_found < 1) {
    lgr::lgr$info(paste0(
      "ERROR: query set must contain at least one valid entry - ",
      "number of validated entries: ",
      n_found))
    return(NULL)
  }

  if (n_found > 2500) {
    lgr::lgr$warn(paste0(
      "Query set must exceeds max limit of 2,500 valid entries - ",
      "limiting input to 2,500 entries"))
    qgenes_match[["found"]] <-
      utils::head(qgenes_match[["found"]], 2500)
  }

  lgr::lgr$info(paste0(
    "Generating rank of query set according to cancer association ",
    "to '", tumor_site, "'"))

  ## ---- annotate validated gene set ----
  ## only pull (and clean) the gene summary when it is requested, rather
  ## than HTML-cleaning the full column and dropping it afterwards
  gene_annotation_cols <-
    c("ensembl_gene_id", "entrezgene", "name", "gene_biotype")
  if (include_gene_summary) {
    gene_annotation_cols <- c(gene_annotation_cols, "gene_summary")
  }

  targets_validated <-
    qgenes_match[["found"]] |>
    dplyr::select(
      c("symbol", "entrezgene")) |>
    dplyr::left_join(
      dplyr::select(
        oeDB[["genedb"]][["all"]],
        dplyr::all_of(gene_annotation_cols)),
      by = "entrezgene"
    )

  if (include_gene_summary) {
    targets_validated$gene_summary <-
      stringr::str_trim(
        textclean::replace_html(targets_validated$gene_summary),
        side = "both")
  }

  ## ---- get cancer gene rank for the requested site ----
  site_rank <- oeDB[["otdb"]][["gene_rank"]] |>
    dplyr::filter(.data$primary_site == tumor_site)

  ## left join keeps query genes without site evidence (rank columns NA);
  ## descending sort puts the highest rank values first
  cancer_rank <-
    targets_validated |>
    dplyr::left_join(site_rank,
                     by = "ensembl_gene_id") |>
    dplyr::arrange(dplyr::desc(.data$tissue_assoc_rank)) |>
    dplyr::rename(
      tumor_site = "primary_site",
      site_assoc_rank = "tissue_assoc_rank",
      site_assoc_score = "tissue_assoc_score"
    )

  ## ---- split into ranked / unranked genes ----
  n_genes_no_site_associations <-
    sum(is.na(cancer_rank[["site_assoc_rank"]]))

  result <- list()
  result[["ranked"]] <- cancer_rank |>
    dplyr::filter(!is.na(.data$site_assoc_rank))
  ## stays an empty data frame when every query gene has a rank
  result[["unranked"]] <- data.frame()

  if (n_genes_no_site_associations > 0) {
    result[["unranked"]] <- cancer_rank |>
      dplyr::filter(is.na(.data$site_assoc_rank))

    pct_unknown_assoc <-
      round((n_genes_no_site_associations /
               NROW(cancer_rank) * 100), 2)
    lgr::lgr$warn(paste0(
      pct_unknown_assoc, "% of the ",
      "query set have no/non-significant cancer association ",
      "with '", tumor_site, "'"))
  }

  return(result)

}

target_disease_associations <-
function(qgenes,
Expand Down
17 changes: 14 additions & 3 deletions R/enrich.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ get_go_enrichment <- function(query_entrez,
go_description = "Description",
count = "Count",
gene_ratio = "GeneRatio",
rich_factor = "RichFactor",
fold_enrichment = "FoldEnrichment",
z_score = "zScore",
background_ratio = "BgRatio",
gene_id = "geneID") |>
dplyr::mutate(
Expand Down Expand Up @@ -242,6 +245,9 @@ get_universal_enrichment <- function(query_entrez,
description = "Description",
count = "Count",
gene_ratio = "GeneRatio",
rich_factor = "RichFactor",
fold_enrichment = "FoldEnrichment",
z_score = "zScore",
background_ratio = "BgRatio",
gene_id = "geneID")
)
Expand All @@ -257,6 +263,7 @@ get_universal_enrichment <- function(query_entrez,
dplyr::mutate(
exact_source = "https://wikipathways.org",
external_url = "https://wikipathways.org",
url = "https://wikipathways.org",
db = dbsource)
}
else if (dbsource == "KEGG") {
Expand All @@ -271,6 +278,7 @@ get_universal_enrichment <- function(query_entrez,
dplyr::mutate(
exact_source = "https://www.genome.jp/kegg/pathway.html",
external_url = "https://www.genome.jp/kegg/pathway.html",
url = "https://www.genome.jp/kegg/pathway.html",
db = dbsource)
}
else if (dbsource == "NetPath") {
Expand All @@ -284,18 +292,21 @@ get_universal_enrichment <- function(query_entrez,
dplyr::mutate(
exact_source = "http://netpath.org",
external_url = "http://netpath.org",
url = "http://netpath.org",
db = dbsource)
}
else{
stopifnot(!is.null(TERM2SOURCE) | !is.data.frame(TERM2SOURCE))
stopifnot("standard_name" %in% colnames(TERM2SOURCE))
df <- df |>
dplyr::left_join(TERM2SOURCE, by="standard_name", relationship = "many-to-many") |>
dplyr::left_join(
TERM2SOURCE, by="standard_name",
relationship = "many-to-many") |>
dplyr::mutate(
description_link =
dplyr::if_else(
!is.na(.data$external_url),
paste0("<a href='",.data$external_url,
!is.na(.data$url),
paste0("<a href='",.data$url,
"' target='_blank'>",
.data$description,"</a>"),
.data$description))
Expand Down
16 changes: 1 addition & 15 deletions R/onco_enrichr.R
Original file line number Diff line number Diff line change
Expand Up @@ -918,20 +918,6 @@ onco_enrich <- function(query = NULL,
return()
}

# val <- assertthat::validate_that(
# html_report_theme %in% c("bootstrap","cerulean","cosmo","default",
# "flatly","journal","lumen","paper","sandstone",
# "simplex","spacelab","united","yeti")
# )
# if (!is.logical(val)) {
# lgr::lgr$info( paste0(
# "ERROR: 'html_report_theme' must take on any of the following values: ",
# "'bootstrap', 'cerulean', 'cosmo', 'default', 'flatly', 'journal', 'lumen',",
# "'paper', 'sandstone', 'simplex', 'spacelab', 'united', 'yeti'",
# " (value provided was '", enrichment_p_value_adj,"')"))
# return()
# }

## Number of allowed query genes
oncoenrichr_query_limit <- 1000

Expand Down Expand Up @@ -1510,7 +1496,7 @@ onco_enrich <- function(query = NULL,
onc_rep[["config"]][["fitness"]][["plot_height_fitness"]] <-
onc_rep[["config"]][["fitness"]][["plot_height_fitness"]] +
as.integer((
onc_rep[["data"]][["fitness"]][["fitness_scores"]][["n_targets"]] - 20)/8.5)
onc_rep[["data"]][["fitness"]][["fitness_scores"]][["n_targets"]] - 20)/5.6)
}

onc_rep[["data"]][["fitness"]][["target_priority_scores"]] <-
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
24 changes: 12 additions & 12 deletions data_processing_code/RELEASE_NOTES.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
##ONCOENRICHR_DB_VERSION = 20240803
##ONCOENRICHR_DB_VERSION = 20240906
name url description version key resource_type license
oncoEnrichR https://github.com/sigven/oncoEnrichR R package for functional interrogation of genesets in the context of cancer v1.5.1 oncoEnrichR software MIT
oncoEnrichR https://github.com/sigven/oncoEnrichR R package for functional interrogation of genesets in the context of cancer v1.5.2 oncoEnrichR software MIT
clusterProfiler https://guangchuangyu.github.io/software/clusterProfiler/ A universal enrichment tool for interpreting omics data (R package) v4.12.0 clusterProfiler software Artistic-2.0
OmnipathR https://omnipathdb.org/ Access to datasets on prior molecular knowledge: gene regulatory interactions, enzyme-PTM relationships, protein complexes, protein annotations etc. v3.11.10 omnipathr software MIT
hu.MAP http://humap2.proteincomplexes.org/ Human Protein Complex Map v2.0 humap2 db CC0 1.0
Expand All @@ -12,20 +12,20 @@ phenOncoX https://github.com/sigven/phenOncoX Crossmapped phenotype ontologies f
STRING https://string-db.org Protein-protein interaction database v12.0 string db CC-BY 4.0
BIOGRID https://thebiogrid.org/ Database of Protein, Genetic and Chemical Interactions v4.4.236 biogrid db MIT
GENCODE https://www.gencodegenes.org/ High quality reference gene annotation and experimental validation v46 gencode db Open Access
The Cancer Genome Atlas (TCGA) https://cancergenome.nih.gov Tumor gene expression and somatic DNA aberrations across a large cohort of tumor samples v39.0 (December 4th 2023) tcga db Open Access
UniProt http://www.uniprot.org Comprehensive resource of protein sequence and functional information v2024_03 uniprot db CC-BY 4.0
The Cancer Genome Atlas (TCGA) https://cancergenome.nih.gov Tumor gene expression and somatic DNA aberrations across a large cohort of tumor samples v41.0 (August 28th 2024) tcga db Open Access
UniProt http://www.uniprot.org Comprehensive resource of protein sequence and functional information v2024_04 uniprot db CC-BY 4.0
NetPath http://www.netpath.org Manually curated resource of signal transduction pathways in humans v1 (2010) netpath db Open Access
EFO https://github.com/EBISPOT/efo Experimental Factor Ontology v3.67.0 efo db Apache 2.0
DiseaseOntology https://github.com/DiseaseOntology Human Disease Ontology 2024-05-29 do db CC0 1.0
COMPARTMENTS https://compartments.jensenlab.org/Search Subcellular localization database February 2024 compartments db CC-BY 4.0
EFO https://github.com/EBISPOT/efo Experimental Factor Ontology v3.69.0 efo db Apache 2.0
DiseaseOntology https://github.com/DiseaseOntology Human Disease Ontology v2024-08-28 do db CC0 1.0
COMPARTMENTS https://compartments.jensenlab.org/Search Subcellular localization database August 2024 compartments db CC-BY 4.0
DepMap/ProjectScore https://depmap.org Integrated cancer dependency dataset from Wellcome Sanger Institute and Broad Institute, Pacini et al., Nat Commun., 2021 Sanger_V1/Broad_20Q2 depmap db CC-BY 4.0
WikiPathways https://www.wikipathways.org A database of biological pathways maintained by and for the scientific community 20240710 wikipathway db CC0 1.0
MSigDB http://software.broadinstitute.org/gsea/msigdb/index.jsp Molecular Signatures Database - collection of annotated gene sets March 2023 (MSigDB v2023.1.Hs) msigdb db CC-BY 4.0
REACTOME https://reactome.org Manually curated and peer-reviewed pathway database v83 (MSigDB v2023.1.Hs) reactome db CC-BY 4.0
WikiPathways https://www.wikipathways.org A database of biological pathways maintained by and for the scientific community 20240810 wikipathway db CC0 1.0
MSigDB http://software.broadinstitute.org/gsea/msigdb/index.jsp Molecular Signatures Database - collection of annotated gene sets August 2024 (MSigDB v2024.1.Hs) msigdb db CC-BY 4.0
REACTOME https://reactome.org Manually curated and peer-reviewed pathway database v89 (MSigDB v2024.1.Hs) reactome db CC-BY 4.0
CellChatDB http://www.cellchat.org/cellchatdb/ Multimeric ligand-receptor complexes v1 (2021) cellchatdb db GPL v3.0
CellTalkDB http://tcm.zju.edu.cn/celltalkdb/ A manually curated database of literature-supported ligand-receptor interactions in human and mouse Nov 2020 celltalkdb db GPL v3.0
GeneOntology https://geneontology.org Knowledgebase that contains the largest structural source of information on the functions of genes March 2023 (MSigDB v2023.1.Hs) go db CC-BY 4.0
KEGG https://www.genome.jp/kegg/pathway.html Collection of manually drawn pathway maps representing our knowledge on the molecular interaction, reaction and relation networks 20240626 kegg db .
GeneOntology https://geneontology.org Knowledgebase that contains the largest structural source of information on the functions of genes May 2024 (MSigDB v2024.1.Hs) go db CC-BY 4.0
KEGG https://www.genome.jp/kegg/pathway.html Collection of manually drawn pathway maps representing our knowledge on the molecular interaction, reaction and relation networks 20240904 kegg db .
IntOGen https://www.intogen.org/search A compendium of mutational cancer driver genes 2023-05-31 intogen db CC0 1.0
CancerMine http://bionlp.bcgsc.ca/cancermine/ Literature-mined database of tumor suppressor genes/proto-oncogenes v50 - 20230301 cancermine db CC0 1.0
Network of cancer genes (NCG) http://ncg.kcl.ac.uk/index.php A web resource to analyze duplicability, orthology and network properties of cancer genes v7.1 ncg db Open Access
Expand Down
8 changes: 4 additions & 4 deletions data_processing_code/data_raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ library(gganatogram)

source('data_processing_code/data_utility_functions.R')

msigdb_version <- 'v2023.1.Hs'
wikipathways_version <- "20240710"
msigdb_version <- 'v2024.1.Hs'
wikipathways_version <- "20240810"
netpath_version <- "2010"
opentargets_version <- "2024.06"
kegg_version <- "20240626"
kegg_version <- "20240904"
gencode_version <- "46"
uniprot_release <- "2024_04"

Expand All @@ -21,7 +21,7 @@ db_updates[['omnipath_regulatory']] <- F
db_updates[['subcelldb']] <- F
db_updates[['ligand_receptor_db']] <- F

oe_version <- "1.5.1"
oe_version <- "1.5.2"

data_raw_dir <-
"/Users/sigven/project_data/packages/package__oncoEnrichR/db/raw"
Expand Down
Loading

0 comments on commit 631209d

Please sign in to comment.