diff --git a/DESCRIPTION b/DESCRIPTION index e4e908a..c7c4965 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: oncoEnrichR Type: Package Title: Cancer-dedicated gene set interpretation Version: 1.3.0 -Date: 2022-09-12 +Date: 2022-09-23 Authors@R: person(given = "Sigve", family = "Nakken", role = c("aut", "cre"), email = "sigven@ifi.uio.no", comment = c(ORCID = "0000-0001-8468-2050")) diff --git a/R/disease_drug.R b/R/disease_drug.R index 8696e21..2095369 100644 --- a/R/disease_drug.R +++ b/R/disease_drug.R @@ -329,7 +329,11 @@ target_drug_associations <- function(qgenes, .data$targeted_cancer_drugs_ep, .data$approved_drugs) |> dplyr::filter(!is.na(.data$targeted_cancer_drugs_lp) | - !is.na(.data$targeted_cancer_drugs_ep)) + !is.na(.data$targeted_cancer_drugs_ep)) |> + dplyr::rename( + drugs_late_phase = .data$targeted_cancer_drugs_lp, + drugs_early_phase = .data$targeted_cancer_drugs_ep + ) lgr::lgr$info( paste0("Open Targets Platform: annotation of target tractabilities (druggability)")) diff --git a/R/onco_enrichr.R b/R/onco_enrichr.R index 9f6ea8a..4c8eec1 100644 --- a/R/onco_enrichr.R +++ b/R/onco_enrichr.R @@ -1610,25 +1610,36 @@ onco_enrich <- function(query = NULL, PROTEIN_DOMAIN = dplyr::if_else( !is.na(.data$PFAM_ID), paste0( - "", .data$PFAM_DOMAIN_NAME, ""), as.character(NA) ) ) |> - dplyr::select(-c(.data$PFAM_DOMAIN_NAME, .data$PFAM_ID)) |> - dplyr::left_join(dplyr::select(oeDB[['genedb']][['all']], - .data$symbol, .data$ensembl_gene_id), - by = c("SYMBOL" = "symbol")) |> - dplyr::rename(ENSEMBL_GENE_ID = .data$ensembl_gene_id) |> - dplyr::mutate(ENSEMBL_TRANSCRIPT_ID = - paste0("", - .data$ENSEMBL_TRANSCRIPT_ID,"")) |> + dplyr::select( + -c(.data$PFAM_DOMAIN_NAME, .data$PFAM_ID)) |> + dplyr::left_join( + dplyr::select(oeDB[['genedb']][['all']], + .data$symbol, .data$ensembl_gene_id), + by = c("SYMBOL" = "symbol")) |> + dplyr::mutate( + ENSEMBL_GENE_ID = + paste0( + "", + .data$ensembl_gene_id,"")) |> + dplyr::mutate( + 
ENSEMBL_TRANSCRIPT_ID = + paste0( + "", + .data$ENSEMBL_TRANSCRIPT_ID,"")) |> dplyr::select(-.data$VAR_ID) |> + dplyr::rename(CONSEQUENCE_ALTERNATE = .data$VEP_ALL_CSQ) |> dplyr::select(.data$SYMBOL, .data$CONSEQUENCE, .data$PROTEIN_CHANGE, @@ -1637,7 +1648,11 @@ onco_enrich <- function(query = NULL, .data$LOSS_OF_FUNCTION, .data$ENSEMBL_GENE_ID, .data$ENSEMBL_TRANSCRIPT_ID, - dplyr::everything()) + .data$PRIMARY_SITE, + .data$SITE_RECURRENCE, + .data$TOTAL_RECURRENCE, + .data$COSMIC_MUTATION_ID, + .data$CONSEQUENCE_ALTERNATE) } for(psite in names(onc_rep[["data"]][["tcga"]][["aberration"]][["table"]][["snv_indel"]])){ @@ -2106,6 +2121,7 @@ write <- function(report, "subcellcomp", "cell_tissue", "aberration", + "recurrent_variants", "coexpression", "prognostic_association_I", "prognostic_association_II" @@ -2115,6 +2131,10 @@ write <- function(report, if(elem == "cancer_association"){ show_elem <- "disease" } + + if(elem == "recurrent_variants"){ + show_elem <- "aberration" + } if(elem == "prognostic_association_I"){ show_elem <- "cancer_prognosis" } diff --git a/R/sysdata.rda b/R/sysdata.rda index 57410bc..2f56bd2 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/utils.R b/R/utils.R index 0aeeb9d..339a110 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1077,11 +1077,11 @@ add_excel_sheet <- function( ) ) |> dplyr::mutate( - targeted_cancer_drugs_lp = + drugs_late_phase = stringr::str_replace_all( stringr::str_squish( stringr::str_trim( - textclean::replace_html(.data$targeted_cancer_drugs_lp) + textclean::replace_html(.data$drugs_late_phase) ) ), " , ", @@ -1089,11 +1089,11 @@ add_excel_sheet <- function( ) ) |> dplyr::mutate( - targeted_cancer_drugs_ep = + drugs_early_phase = stringr::str_replace_all( stringr::str_squish( stringr::str_trim( - textclean::replace_html(.data$targeted_cancer_drugs_ep) + textclean::replace_html(.data$drugs_early_phase) ) ), " , ", @@ -1420,6 +1420,78 @@ add_excel_sheet <- function( } + if(analysis_output == 
"recurrent_variants"){ + + if(is.data.frame(report$data$tcga$recurrent_variants)){ + if(NROW(report$data$tcga$recurrent_variants) > 0){ + df <- + report$data$tcga$recurrent_variants + + colnames(df) <- tolower(colnames(df)) + df <- as.data.frame( + df |> + dplyr::mutate( + site_recurrence = as.numeric(.data$site_recurrence) + ) |> + dplyr::arrange( + dplyr::desc(.data$total_recurrence), + dplyr::desc(.data$site_recurrence)) |> + dplyr::mutate( + annotation_source = report$config$resources$tcga$name, + version = report$config$resources$tcga$version) |> + dplyr::mutate( + ensembl_gene_id = + stringr::str_trim( + textclean::replace_html(.data$ensembl_gene_id) + ) + ) |> + dplyr::mutate( + ensembl_transcript_id = + stringr::str_trim( + textclean::replace_html(.data$ensembl_transcript_id) + ) + ) |> + dplyr::mutate( + protein_domain = + stringr::str_trim( + textclean::replace_html(.data$protein_domain) + ) + ) |> + dplyr::mutate( + cosmic_mutation_id = + stringr::str_trim( + textclean::replace_html(.data$cosmic_mutation_id) + ) + ) |> + dplyr::mutate( + site_recurrence = paste(.data$primary_site, + .data$site_recurrence, sep=":") + ) |> + dplyr::group_by( + .data$symbol, .data$consequence, + .data$protein_change, .data$protein_domain, + .data$mutation_hotspot, + .data$loss_of_function, + .data$ensembl_gene_id, + .data$ensembl_transcript_id, + .data$total_recurrence, + .data$cosmic_mutation_id + ) |> + dplyr::summarise(site_recurrence = paste( + .data$site_recurrence, collapse=", " + ), .groups = "drop") |> + dplyr::arrange( + dplyr::desc(.data$total_recurrence) + ) + + ) + + target_df <- target_df |> + dplyr::bind_rows(df) + } + } + } + if(analysis_output == "aberration"){ ## cna aberrations diff --git a/README.md b/README.md index d2df8b6..99fd25d 100644 --- a/README.md +++ b/README.md @@ -20,13 +20,14 @@ Web-based access to **oncoEnrichR** is available at [**https://oncotools.elixir. 
## News +- September 23rd 2022: [**1.3.1 release**](https://sigven.github.io/oncoEnrichR/articles/CHANGELOG.html#version-1-3-1) - September 12th 2022: [**1.3.0 release**](https://sigven.github.io/oncoEnrichR/articles/CHANGELOG.html#version-1-3-0) - September 2nd 2022: [**1.2.2 release**](https://sigven.github.io/oncoEnrichR/articles/CHANGELOG.html#version-1-2-2) - July 13th 2022: [**1.2.1 release**](https://sigven.github.io/oncoEnrichR/articles/CHANGELOG.html#version-1-2-1) ## Example report -DOI +DOI ### Contact @@ -34,7 +35,7 @@ sigven AT ifi.uio.no ### Funding and Collaboration -OncoEnrichR is supported by the [Centre for Cancer Cell Reprogramming](https://www.med.uio.no/cancell/english/) at the [University of Oslo](https://www.uio.no)/[Oslo University Hospital](https://radium.no), and [Elixir Norway (Oslo node)](https://elixir.no/organization/organisation/elixir-uio). +oncoEnrichR is supported by the [Centre for Cancer Cell Reprogramming](https://www.med.uio.no/cancell/english/) at the [University of Oslo](https://www.uio.no)/[Oslo University Hospital](https://radium.no), and [Elixir Norway (Oslo node)](https://elixir.no/organization/organisation/elixir-uio).

diff --git a/data_processing_code/RELEASE_NOTES.txt b/data_processing_code/RELEASE_NOTES.txt index c6d2c81..956cd94 100644 --- a/data_processing_code/RELEASE_NOTES.txt +++ b/data_processing_code/RELEASE_NOTES.txt @@ -1,26 +1,26 @@ -##ONCOENRICHR_DB_VERSION = 20220910 -oncoEnrichR https://gihtub.com/sigven/oncoEnrichR R package for functional interrogation of genesets in the context of cancer v1.3.0 oncoEnrichR software +##ONCOENRICHR_DB_VERSION = 20220921 +oncoEnrichR https://github.com/sigven/oncoEnrichR R package for functional interrogation of genesets in the context of cancer v1.3.1 oncoEnrichR software Omnipath https://omnipathdb.org/ Database of molecular biology prior knowledge: gene regulatory interactions, enzyme-PTM relationships, protein complexes, protein annotations etc. v3.4.0/OmnipathR omnipath db hu.MAP http://humap2.proteincomplexes.org/ Human Protein Complex Map v2.0 humap2 db dorothea https://saezlab.github.io/dorothea/ Gene set resource containing signed transcription factor (TF) - target interactions v1.8.0 dorothea db tissueEnrich https://www.bioconductor.org/packages/release/bioc/vignettes/TissueEnrich/inst/doc/TissueEnrich.html R package used to calculate enrichment of tissue-specific genes in a set of input genes v1.16.0 tissueenrich software -oncoPhenoMap https://github.com/sigven/oncoPhenoMap Crossmapped phenotype ontologies for the oncology domain v0.3.8 oncophenomap software +oncoPhenoMap https://github.com/sigven/oncoPhenoMap Crossmapped phenotype ontologies for the oncology domain v0.4.0 oncophenomap software STRING https://string-db.org Protein-protein interaction database v11.5 string db GENCODE https://www.gencodegenes.org/ High quality reference gene annotation and experimental validation v41 gencode db -TCGA https://cancergenome.nih.gov The Cancer Genome Atlas - Tumor gene expression and somatic DNA aberrations v32.0 (March 29th 2022) tcga db +TCGA https://cancergenome.nih.gov The Cancer Genome Atlas - Tumor gene expression and
somatic DNA aberrations v34.0 (July 27th 2022) tcga db UniProtKB http://www.uniprot.org Comprehensive resource of protein sequence and functional information v2022_03 uniprot db NetPath http://www.netpath.org Manually curated resource of signal transduction pathways in humans v1 (2010) netpath db -EFO https://github.com/EBISPOT/efo Experimental Factor Ontology v3.43.0 efo db -DiseaseOntology https://github.com/DiseaseOntology Human Disease Ontology 2022-06-07 do db +EFO https://github.com/EBISPOT/efo Experimental Factor Ontology v3.46.0 efo db +DiseaseOntology https://github.com/DiseaseOntology Human Disease Ontology 2022-08-29 do db COMPPI https://comppi.linkgroup.hu/ Compartmentalized protein-protein interaction database v2.1.1 (October 2018) comppi db -WikiPathways https://www.wikipathways.org A database of biological pathways maintained by and for the scientific community 20220810 wikipathway db +WikiPathways https://www.wikipathways.org A database of biological pathways maintained by and for the scientific community 20220910 wikipathway db MSigDB http://software.broadinstitute.org/gsea/msigdb/index.jsp Molecular Signatures Database - collection of annotated gene sets August 2022 (MSigDB v2022.1) msigdb db REACTOME https://reactome.org Manually curated and peer-reviewed pathway database v81 (MSigDB v2022.1) reactome db CellChatDB http://www.cellchat.org/cellchatdb/ Multimeric ligand-receptor complexes v1 (2021) cellchatdb db CellTalkDB http://tcm.zju.edu.cn/celltalkdb/ A manually curated database of literature-supported ligand-receptor interactions in human and mouse Nov 2020 celltalkdb db GeneOntology https://geneontology.org Knowledgebase that contains the largest structural source of information on the functions of genes August 2022 (MSigDB v2022.1) go db KEGG https://www.genome.jp/kegg/pathway.html Collection of manually drawn pathway maps representing our knowledge on the molecular interaction, reaction and relation networks 20220809 kegg db -CancerMine 
http://bionlp.bcgsc.ca/cancermine/ Literature-mined database of tumor suppressor genes/proto-oncogenes v47 - 20220708 cancermine db +CancerMine http://bionlp.bcgsc.ca/cancermine/ Literature-mined database of tumor suppressor genes/proto-oncogenes v48 - 20220920 cancermine db NCG http://ncg.kcl.ac.uk/index.php Network of cancer genes - a web resource to analyze duplicability, orthology and network properties of cancer genes v7.0 ncg db CGC https://cancer.sanger.ac.uk/census Cancer Gene Census v96 cgc db Pfam http://pfam.xfam.org Collection of protein families/domains 2021_11 (v35.0) pfam db diff --git a/data_processing_code/data_raw.R b/data_processing_code/data_raw.R index 7e1899c..91f54b3 100644 --- a/data_processing_code/data_raw.R +++ b/data_processing_code/data_raw.R @@ -4,7 +4,7 @@ library(gganatogram) source('data_processing_code/data_utility_functions.R') msigdb_version <- '2022.1' -wikipathways_version <- "20220810" +wikipathways_version <- "20220910" netpath_version <- "2010" opentargets_version <- "2022.06" kegg_version <- "20220809" @@ -14,17 +14,17 @@ uniprot_release <- "2022_03" ## Which databases to update or retrieve from last updated state update_omnipathdb <- F update_hpa <- F -update_ncbi_gene_summary <- T +update_ncbi_gene_summary <- F update_project_score <- F update_project_survival <- F -update_tcga <- F +update_tcga <- T update_cancer_hallmarks <- F update_omnipath_regulatory <- F update_omnipath_complexdb <- F update_gencode <- F update_ligand_receptor_db <- T -oe_version <- "1.3.0" +oe_version <- "1.3.1" data_raw_dir <- "/Users/sigven/project_data/package__oncoEnrichR/db/raw" data_output_dir <- "/Users/sigven/project_data/package__oncoEnrichR/db/output" @@ -99,7 +99,7 @@ ts_oncogene_annotations <- get_ts_oncogene_annotations( raw_db_dir = data_raw_dir, gene_info = gene_info, - version = "47") |> + version = "48") |> dplyr::select( entrezgene, tumor_suppressor, oncogene, citation_links_oncogene, diff --git 
a/data_processing_code/data_utility_functions.R b/data_processing_code/data_utility_functions.R index 49a6ae0..dc5f5bf 100644 --- a/data_processing_code/data_utility_functions.R +++ b/data_processing_code/data_utility_functions.R @@ -3868,6 +3868,8 @@ get_tcga_db <- function( gene_xref = NULL, update = F){ + tcga_release <- 'release34_20220727' + rds_fname <- file.path( raw_db_dir, "tcga", @@ -3876,7 +3878,10 @@ get_tcga_db <- function( coexpression_tsv <- file.path( raw_db_dir, "tcga", - "co_expression_strong_moderate.release32_20220329.tsv.gz") + paste0( + "co_expression_strong_moderate.", + tcga_release, + ".tsv.gz")) tcga_clinical_rds <- file.path( raw_db_dir, @@ -3970,16 +3975,22 @@ get_tcga_db <- function( primary_site <- maf_codes[i,]$primary_site maf_code <- maf_codes[i,]$code maf_file <- file.path( - maf_path, paste0("tcga_mutation_grch38_release32_20220329.",maf_code,"_0.maf.gz")) + maf_path, paste0( + "tcga_mutation_grch38_", + tcga_release, + ".", + maf_code,"_0.maf.gz")) if(file.exists(maf_file)){ tmp <- read.table(gzfile(maf_file), quote="", header = T, stringsAsFactors = F, sep="\t", comment.char="#") tmp$primary_site <- NULL tmp$site_diagnosis_code <- NULL - tmp$Tumor_Sample_Barcode <- stringr::str_replace(tmp$Tumor_Sample_Barcode,"-[0-9][0-9][A-Z]$","") + tmp$Tumor_Sample_Barcode <- stringr::str_replace( + tmp$Tumor_Sample_Barcode,"-[0-9][0-9][A-Z]$","") - clinical <- tcga_clinical |> dplyr::filter(primary_site == primary_site) |> + clinical <- tcga_clinical |> + dplyr::filter(primary_site == primary_site) |> dplyr::select(bcr_patient_barcode, primary_diagnosis_very_simplified, MSI_status, Gleason_score, ER_status, PR_status, HER2_status, @@ -4029,7 +4040,9 @@ get_tcga_db <- function( ENSEMBL_TRANSCRIPT_ID, SYMBOL, COSMIC_MUTATION_ID, - Consequence) |> + Consequence, + AMINO_ACID_START, + VEP_ALL_CSQ) |> dplyr::mutate(VAR_ID = paste( CHROM, POS, REF, ALT, sep = "_") ) |> @@ -4047,13 +4060,15 @@ get_tcga_db <- function( VAR_ID, CONSEQUENCE, 
PROTEIN_CHANGE, + AMINO_ACID_START, PFAM_ID, MUTATION_HOTSPOT, LOSS_OF_FUNCTION, ENSEMBL_TRANSCRIPT_ID, COSMIC_MUTATION_ID, TCGA_SITE_RECURRENCE, - TOTAL_RECURRENCE) |> + TOTAL_RECURRENCE, + VEP_ALL_CSQ) |> tidyr::separate_rows(TCGA_SITE_RECURRENCE, sep=",") |> tidyr::separate(TCGA_SITE_RECURRENCE, into = c("PRIMARY_SITE","SITE_RECURRENCE", "TCGA_SAMPLES"), @@ -4077,6 +4092,81 @@ get_tcga_db <- function( CONSEQUENCE,"^(intron|intergenic|mature|non_coding|synonymous|upstream|downstream|3_prime|5_prime)")) ) + csq_all_fixed <- as.data.frame( + recurrent_tcga_variants |> + dplyr::select(VAR_ID, VEP_ALL_CSQ) |> + tidyr::separate_rows(VEP_ALL_CSQ, sep=",") |> + tidyr::separate( + VEP_ALL_CSQ, c("V1","V2","V3","V4", + "V5","V6","V7","V8","V9"), + sep = ":") |> + dplyr::mutate(VEP_ALL_CSQ = paste( + V1,V2,V5,V6, sep=":" + )) |> + dplyr::group_by(VAR_ID) |> + dplyr::summarise(VEP_ALL_CSQ = paste( + unique(VEP_ALL_CSQ), collapse=", " + ), .groups = "drop") + ) + + + hotspots_fixed <- recurrent_tcga_variants |> + dplyr::filter(!is.na(MUTATION_HOTSPOT)) |> + tidyr::separate_rows(MUTATION_HOTSPOT, sep="&") |> + dplyr::select(SYMBOL, PROTEIN_CHANGE, VAR_ID, MUTATION_HOTSPOT) |> + tidyr::separate(MUTATION_HOTSPOT, c("genesym","aapos","altaa","qvalue"), + sep = "\\|", remove = F) |> + dplyr::mutate(hs_hgvsp = paste0( + "p.",aapos,altaa + )) |> + dplyr::mutate(mismatch = dplyr::if_else( + PROTEIN_CHANGE != hs_hgvsp & + nchar(altaa) > 0 & + !(stringr::str_detect( + PROTEIN_CHANGE,"\\?")), + TRUE, + FALSE + )) |> + dplyr::filter(mismatch == T) |> + dplyr::mutate( + MUTATION_HOTSPOT_FIXED = paste0( + genesym, "|", + aapos,"/", + stringr::str_replace_all( + PROTEIN_CHANGE,"p\\.|[A-Z]$","" + ), "|", + altaa, "|", qvalue + ) + ) |> + dplyr::select( + SYMBOL, PROTEIN_CHANGE, + VAR_ID, MUTATION_HOTSPOT_FIXED + ) |> + dplyr::distinct() + + recurrent_tcga_variants$VEP_ALL_CSQ <- NULL + recurrent_tcga_variants <- recurrent_tcga_variants |> + dplyr::left_join(csq_all_fixed, by = "VAR_ID") |> 
+ dplyr::left_join( + hotspots_fixed, + by = c("SYMBOL","PROTEIN_CHANGE","VAR_ID")) |> + dplyr::mutate(MUTATION_HOTSPOT = dplyr::if_else( + !is.na(MUTATION_HOTSPOT_FIXED), + as.character(MUTATION_HOTSPOT_FIXED), + as.character(MUTATION_HOTSPOT) + )) |> + dplyr::select(-MUTATION_HOTSPOT_FIXED) |> + dplyr::select( + SYMBOL, VAR_ID, CONSEQUENCE, + PROTEIN_CHANGE, MUTATION_HOTSPOT, + AMINO_ACID_START, PFAM_ID, + LOSS_OF_FUNCTION, ENSEMBL_TRANSCRIPT_ID, + COSMIC_MUTATION_ID, PRIMARY_SITE, + SITE_RECURRENCE, TOTAL_RECURRENCE, + VEP_ALL_CSQ + ) + + ##TCGA co-expression data raw_coexpression <- readr::read_tsv(coexpression_tsv, @@ -4115,7 +4205,6 @@ get_tcga_db <- function( gdc_projects <- as.data.frame(TCGAbiolinks::getGDCprojects()) |> dplyr::filter(is.na(dbgap_accession_number) & startsWith(id,"TCGA")) - tcga_release <- 'release32_20220329' all_tcga_mean_median_tpm <- data.frame() @@ -4123,8 +4212,8 @@ get_tcga_db <- function( tumor_code <- gdc_projects[i,]$tumor rnaseq_rds_fname <- file.path("","Users","sigven", "project_data","analysis__tcga", - "tcga", "data", "rnaseq", - paste0("tcga_rnaseq_",tumor_code,"_", + "tcga", "output", "rnaseq", + paste0("rnaseq_",tumor_code,"_", tcga_release,".rds")) cat(tumor_code, '\n') diff --git a/docker/Dockerfile b/docker/Dockerfile index 7fd1590..782ed75 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -14,7 +14,7 @@ RUN sudo apt update -qq # install two helper packages we need RUN sudo apt-get update && sudo apt-get -y install --no-install-recommends software-properties-common dirmngr # add the signing key (by Michael Rutter) for these repos -# To verify key, run gpg --show-keys /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc +# To verify key, run gpg --show-keys /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc # Fingerprint: E298A3A825C0D65DFD57CBB651716619E084DAB9 RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sudo tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc # add the R 4.0 repo from CRAN -- 
adjust 'focal' to 'groovy' or 'bionic' as needed @@ -45,7 +45,7 @@ RUN cd cmake-3.20.0 && ./bootstrap && make && make install ## PINNED versions of all packages can be found in /renv.lock (JSON format) RUN R -e "install.packages(repos = NULL, 'https://cloud.r-project.org/src/contrib/renv_0.15.5.tar.gz', lib = '/usr/lib/R/library/')" COPY renv.lock / -COPY oncoEnrichR_1.3.0.tar.gz / +COPY oncoEnrichR_1.3.1.tar.gz / #RUN R -e "library(renv)" @@ -53,7 +53,7 @@ RUN R -e "library(renv); renv::restore(prompt = F, library = '/usr/lib/R/library WORKDIR / ## PINNED version - oncoEnrichR -RUN R -e "install.packages(repos = NULL, 'oncoEnrichR_1.3.0.tar.gz', lib = '/usr/lib/R/library/')" +RUN R -e "install.packages(repos = NULL, 'oncoEnrichR_1.3.1.tar.gz', lib = '/usr/lib/R/library/')" ## PINNED VERSION: pandoc (for HTML report generation) RUN wget https://github.com/jgm/pandoc/releases/download/2.13/pandoc-2.13-1-amd64.deb && \ @@ -62,5 +62,5 @@ RUN wget https://github.com/jgm/pandoc/releases/download/2.13/pandoc-2.13-1-amd6 apt-get clean RUN rm -rf /root/.cache -RUN rm -rf /oncoEnrichR_1.3.0.tar.gz +RUN rm -rf /oncoEnrichR_1.3.1.tar.gz RUN rm -rf /cmake-3.20.0* diff --git a/docker/oncoenrichr_wrapper.xml b/docker/oncoenrichr_wrapper.xml index ff3dc3f..89938a7 100755 --- a/docker/oncoenrichr_wrapper.xml +++ b/docker/oncoenrichr_wrapper.xml @@ -1,7 +1,7 @@ - + Cancer-dedicated gene set interpretation - sigven/oncoenrichr:1.2.2 + sigven/oncoenrichr:1.3.1
') * Browse recurrent, protein-coding somatic SNVs/InDels from TCGA in the query set * Variants are listed as one record _per tissue/site_, in effect making the same variant occurring in multiple rows * Only variants with a site-specific frequency >= 2 are shown - * Variants can be filtered based on various properties, e.g. __Site/tissue variant recurrence__, as well as overall recurrence across all tumor sites (column __Pancancer variant recurrence__) + * Variants can be filtered based on various properties, e.g. __Site/tissue variant recurrence__, as well as overall recurrence across all tumor sites (column __Pan-cancer variant recurrence__) * Notably, each variant have been annotated/classified with a 1. _loss-of-function_ status, based on the [LOFTEE plugin in VEP](https://github.com/konradjk/loftee) - 2. Status as  somatic mutation hotspots  in cancer, according to [cancerhotspots.org](https://cancerhotspots.org). Format: _\|\|\_ + 2. Status as  somatic mutation hotspots  in cancer, according to [cancerhotspots.org](https://cancerhotspots.org). Format: _\|\|\|\_ * Top 2,500 recurrent variants are listed here (all variants are listed in the Excel output of *oncoEnrichR*)

@@ -711,7 +711,9 @@ mutation_hotspot_levels <- levels(as.factor( vars <- vars |> - dplyr::arrange(dplyr::desc(SITE_RECURRENCE), dplyr::desc(LOSS_OF_FUNCTION)) |> + dplyr::arrange( + dplyr::desc(SITE_RECURRENCE), + dplyr::desc(LOSS_OF_FUNCTION)) |> dplyr::mutate(CONSEQUENCE = stringr::str_replace_all( CONSEQUENCE, "&", ", " )) |> @@ -784,7 +786,7 @@ cat('

') 1.   sCNA - amplifications   2.   sCNA - homozygous deletions   * The values in the heatmaps reflect the percent of all tumor samples pr. primary site with the gene amplified/lost (**percent_mutated**) - * Genes in the heatmap are ranked according to alteration frequency across all sites (i.e. _pancancer_), limited to the top 75 genes in the query set + * Genes in the heatmap are ranked according to alteration frequency across all sites (i.e. _pan-cancer_), limited to the top 75 genes in the query set * Frequencies across all subtypes per primary site are listed in an interactive table * Only including genes that are aberrant in >= 1 percent of samples for a given tumor type/subtype * Limited here to top 2,500 gene-subtype frequencies (all aberration frequencies are listed in the Excel output of *oncoEnrichR*) diff --git a/inst/templates/_documentation_workflow.Rmd b/inst/templates/_documentation_workflow.Rmd index d5d0307..666f568 100644 --- a/inst/templates/_documentation_workflow.Rmd +++ b/inst/templates/_documentation_workflow.Rmd @@ -30,7 +30,7 @@ The configurations set for this oncoEnrichR report is outlined below, in additio * `show_aberration` = __`r onc_enrich_report[['config']][['show']][['aberration']]`__ * `show_prognostic` = __`r onc_enrich_report[['config']][['show']][['cancer_prognosis']]`__ -* Cancer assocations: +* Cancer associations: * `show_top_diseases_only` = __`r onc_enrich_report[['config']][['disease']][['show_top_diseases']]`__ * Regulatory interactions: diff --git a/inst/templates/_drug_target_association.Rmd b/inst/templates/_drug_target_association.Rmd index 25b3c9a..9441ed3 100644 --- a/inst/templates/_drug_target_association.Rmd +++ b/inst/templates/_drug_target_association.Rmd @@ -2,7 +2,10 @@ * Each protein/protein in the query set is annotated with: * Targeted drugs (inhibitors/antagonists), as found through the [Open Targets Platform](https://targetvalidation.org), limited to compounds indicated for a cancer condition/phenotype 
- * We distinguish between drugs in early clinical development/phase (ep), and drugs already in late clinical development/phase (lp) + * Drugs are organized into the following buckets: + * Targeted cancer drugs in early clinical development/phase (phase 1-2) (column drugs_early_phase) + * Targeted cancer drugs in late clinical development/phase (phase 3-4) (column drugs_late_phase) + * Approved drugs - here also showing the (cancer type) indications for which the drugs are approved

diff --git a/inst/templates/_fitness_lof.Rmd b/inst/templates/_fitness_lof.Rmd index 597ae15..395b4ed 100644 --- a/inst/templates/_fitness_lof.Rmd +++ b/inst/templates/_fitness_lof.Rmd @@ -121,7 +121,7 @@ cat('

') #### Target priority scores -* Promising candidate therapeutic targets are indicated through __[target priority scores](https://score.depmap.sanger.ac.uk/documentation#scores)__. Target priority scores are based on integration of CRISPR knockout gene fitness effects with genomic biomarker and patient data ([Behan et al., Nature, 2019](https://pubmed.ncbi.nlm.nih.gov/30971826/)). All genes are assigned a target priority score between 0 – 100 from lowest to highest priority. In the heatmap shown below, genes in the query set are ranked according to their respective priority scores across all cancers (i.e. _Pancancer_), limited to the top 100 candidates. +* Promising candidate therapeutic targets are indicated through __[target priority scores](https://score.depmap.sanger.ac.uk/documentation#scores)__. Target priority scores are based on integration of CRISPR knockout gene fitness effects with genomic biomarker and patient data ([Behan et al., Nature, 2019](https://pubmed.ncbi.nlm.nih.gov/30971826/)). All genes are assigned a target priority score between 0 – 100 from lowest to highest priority. In the heatmap shown below, genes in the query set are ranked according to their respective priority scores across all cancers (i.e. _Pan-cancer_), limited to the top 100 candidates.
diff --git a/inst/templates/_functional_enrichment.Rmd b/inst/templates/_functional_enrichment.Rmd index eb2ecdc..7f6001c 100644 --- a/inst/templates/_functional_enrichment.Rmd +++ b/inst/templates/_functional_enrichment.Rmd @@ -1,6 +1,6 @@ ### Function and pathway enrichment -* The query set is analyzed with [clusterProfiler](https://bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.html) for functional enrichment/overrepresentation with respect to: +* The query set is analyzed with [clusterProfiler](https://bioconductor.org/packages/release/bioc/vignettes/clusterProfiler/inst/doc/clusterProfiler.html) for functional enrichment/over-representation with respect to: * [Gene Ontology terms](https://geneontology.org). All three subontologies: _Molecular Function_ (GO_MF), _Cellular Component_ (GO_CC) & _Biological Process_ (GO_BP) * Molecular signalling networks from [KEGG](https://www.genome.jp/kegg/pathway.html) * Cellular pathways from [Reactome](https://reactome.org/), and other curated gene signature sets from the [Molecular Signatures Database (MSiGDB)](http://software.broadinstitute.org/gsea/msigdb/index.jsp) @@ -9,7 +9,7 @@
-* Enrichment/overrepresentation test settings (clusterProfiler) +* Enrichment/over-representation test settings (_clusterProfiler_) * P-value cutoff: __`r onc_enrich_report[['config']][['enrichment']][['p_value_cutoff']]`__ * Q-value cutoff: __`r onc_enrich_report[['config']][['enrichment']][['q_value_cutoff']]`__ * Correction for multiple testing: __`r onc_enrich_report[['config']][['enrichment']][['p_adjust_method']]`__ @@ -481,15 +481,11 @@ plot_data <- onc_enrich_report[['data']][['enrichment']][['go']] |> plot_data$description <- factor(plot_data$description, levels = plot_data$description) -#max_y <- max(plyr::round_any(-log10(plot_data$qvalue), 10, f = ceiling)) max_y <- max(plyr::round_any(plot_data$enrichment_factor, 10, f = ceiling)) p <- ggplot2::ggplot( plot_data, - #ggplot2::aes( x = description, y = -log10(qvalue), fill = Subontology) ) + ggplot2::aes( x = description, y = enrichment_factor, fill = qvalue) ) + - #ggplot2::scale_fill_gradient(low = "yellow", high = "red", midpoint = midpoint_qvalue, na.value = NA) + - #ggplot2::scale_fill_brewer(palette = "Dark2") + ggplot2::geom_bar( stat = "identity" ) + ggplot2::xlab("") + ggplot2::ylab("Enrichment") + @@ -497,7 +493,6 @@ p <- ggplot2::ggplot( ggplot2::theme_classic() + ggplot2::coord_flip() + ggplot2::theme( - #legend.position = "none", legend.title = ggplot2::element_blank(), axis.text.x = ggplot2::element_text(size = plot_fontsize, vjust = 0.5), axis.text.y = ggplot2::element_text(family = "Helvetica", size = plot_fontsize), diff --git a/inst/templates/_ligand_receptor.Rmd b/inst/templates/_ligand_receptor.Rmd index 7b7f087..1cbdc07 100644 --- a/inst/templates/_ligand_receptor.Rmd +++ b/inst/templates/_ligand_receptor.Rmd @@ -2,7 +2,7 @@
-* Using data from the [CellChatDB](http://www.cellchat.org/) resource, we are here interrogating ligand-receptor interactions for members of the query set. Putative interactions are displayed along three different axes with respect to cell-cell comunication: +* Using data from the [CellChatDB](http://www.cellchat.org/) resource, we are here interrogating ligand-receptor interactions for members of the query set. Putative interactions are displayed along three different axes with respect to cell-cell communication: 1) Secreted Signaling (Paracrine/autocrine signaling) 2) ECM-Receptor (extracellular matrix-receptor interactions) diff --git a/pkgdown/index.md b/pkgdown/index.md index 0e14eaf..de4a959 100644 --- a/pkgdown/index.md +++ b/pkgdown/index.md @@ -29,7 +29,7 @@ Web-based access to **oncoEnrichR** is available at

-## Questions adressed by oncoEnrichR +## Questions addressed by oncoEnrichR The contents of the analysis report provided by oncoEnrichR address the following scientific questions for a given gene list: @@ -38,7 +38,7 @@ following scientific questions for a given gene list: the query set, and to what extent? - Which genes in the query set are attributed with cancer hallmark evidence? -- Which proteins in the query sets are druggable in diffferent cancer +- Which proteins in the query sets are druggable in different cancer conditions (early and late clinical development phases)? For other proteins in the query set, what is their likelihood of being druggable? @@ -85,6 +85,8 @@ See also the [output views](articles/output.html) that addresses each of the que ## News +- September 23rd 2022: [**1.3.1 + release**](articles/CHANGELOG.html#version-1-3-1) - September 12th 2022: [**1.3.0 release**](articles/CHANGELOG.html#version-1-3-0) - September 2nd 2022: [**1.2.2 @@ -94,7 +96,7 @@ See also the [output views](articles/output.html) that addresses each of the que ## Example report -DOI +DOI
diff --git a/tests/testthat/test_disease_drug.R b/tests/testthat/test_disease_drug.R index 0cee065..f022ed9 100644 --- a/tests/testthat/test_disease_drug.R +++ b/tests/testthat/test_disease_drug.R @@ -36,8 +36,8 @@ test_that("Target drug associations", { qgenes = c("EGFR","BRAF"), genedb = oedb$genedb$all)$target_drugs), c("symbol","genename", - "targeted_cancer_drugs_lp", - "targeted_cancer_drugs_ep", + "drugs_late_phase", + "drugs_early_phase", "approved_drugs") ) diff --git a/vignettes/CHANGELOG.Rmd b/vignettes/CHANGELOG.Rmd index e477cda..46df9c9 100644 --- a/vignettes/CHANGELOG.Rmd +++ b/vignettes/CHANGELOG.Rmd @@ -7,6 +7,33 @@ vignette: > %\usepackage[UTF-8]{inputenc} --- +## Version 1.3.1 + +* Date: 2022-09-23 + +### Added + +* Data updates: + * CancerMine (20220920 - v48) + * TCGA (20220727 - v34.0) + * WikiPathways (20220910) + * Upgraded [sigven/oncoPhenoMap](https://github.com/sigven/oncoPhenoMap) to + v0.4.0: + * Experimental Factor Ontology (v3.46.0) + * Human Disease Ontology (2022-08-29) + +* Recurrent somatic variants (SNVs/InDels, as found in TCGA) are appended to +Excel output worksheet, tab `RECURRENT_VARIANTS` + +### Fixed + +* A few erroneous mutation hotspots in `Tumor aberration frequencies` section + +### Changed + +* Slight modification to column names in `Drug associations` section +* Added links to `ENSEMBL_GENE_ID` in `Tumor aberration frequencies` section + ## Version 1.3.0 * Date: 2022-09-12 diff --git a/vignettes/running.Rmd b/vignettes/running.Rmd index 97159c7..518b75c 100644 --- a/vignettes/running.Rmd +++ b/vignettes/running.Rmd @@ -102,7 +102,7 @@ A target list of *n = 134* high-confidence interacting proteins with the c-MYC o - `project_title = "cMYC_BioID_screen"` - `project_owner = "Raught et al."` -and produced the [following HTML report with results](https://doi.org/10.5281/zenodo.7042674). +and produced the [following HTML report with results](https://doi.org/10.5281/zenodo.7104355). 
Below are R commands provided to reproduce the example output. **NOTE**: Replace "LOCAL_FOLDER" with a directory on your local computer: