Merge pull request #257 from qbic-pipelines/dev

Release 2.5
qbic-pipelines · Dec 13, 2024 · e53bbae · e53bbae
2 parents ffbf84a + 64d0e3d
commit e53bbae
Show file tree

Hide file tree

Showing 14 changed files with 124 additions and 47 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -39,14 +39,14 @@ jobs:
             environment.yml
       - name: Build new docker image
         if: env.MATCHED_FILES
-        run: docker build --no-cache . -t ghcr.io/qbic-pipelines/rnadeseq:2.4
+        run: docker build --no-cache . -t ghcr.io/qbic-pipelines/rnadeseq:2.5
 
       # Change the version above and the third version below before/after release
       - name: Pull docker image
         if: ${{ !env.MATCHED_FILES }}
         run: |
           docker pull ghcr.io/qbic-pipelines/rnadeseq:dev
-          docker tag ghcr.io/qbic-pipelines/rnadeseq:dev ghcr.io/qbic-pipelines/rnadeseq:2.4
+          docker tag ghcr.io/qbic-pipelines/rnadeseq:dev ghcr.io/qbic-pipelines/rnadeseq:2.5
 
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1
@@ -93,14 +93,14 @@ jobs:
             environment.yml
       - name: Build new docker image
         if: env.MATCHED_FILES
-        run: docker build --no-cache . -t ghcr.io/qbic-pipelines/rnadeseq:2.4
+        run: docker build --no-cache . -t ghcr.io/qbic-pipelines/rnadeseq:2.5
 
       # Change the version above and the third version below before/after release
       - name: Pull docker image
         if: ${{ !env.MATCHED_FILES }}
         run: |
           docker pull ghcr.io/qbic-pipelines/rnadeseq:dev
-          docker tag ghcr.io/qbic-pipelines/rnadeseq:dev ghcr.io/qbic-pipelines/rnadeseq:2.4
+          docker tag ghcr.io/qbic-pipelines/rnadeseq:dev ghcr.io/qbic-pipelines/rnadeseq:2.5
 
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1
@@ -129,7 +129,7 @@ jobs:
 
       - name: Upload logs on failure
         if: failure()
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: logs-${{ matrix.profile }}
           path: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,10 +3,31 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 2.5 - The Potato Eaters
+
+### Added
+
+- [#256](https://github.com/qbic-pipelines/rnadeseq/pull/256) Add trycatch to pathway enrichment plots so they are skipped when too large instead of throwing an error
+- [#255](https://github.com/qbic-pipelines/rnadeseq/pull/255) Add usage docu for datasources, heatmaps_cluster_rows/cols and pathway_adj_pval_threshold params
+- [#251](https://github.com/qbic-pipelines/rnadeseq/pull/251) Get raw gene count tables from either Salmon and RSEM analysis
+- [#250](https://github.com/qbic-pipelines/rnadeseq/pull/250) Added clearer error message for incorrect contrast_pairs
+
+### Changed
+
+- [#260](https://github.com/qbic-pipelines/rnadeseq/pull/260) Release 2.5
+- [#259](https://github.com/qbic-pipelines/rnadeseq/pull/259) Bump versions for release 2.5
+
+### Fixed
+
+- [#258](https://github.com/qbic-pipelines/rnadeseq/pull/258) Fixed some comments for release (removed excess checks for pathway_adj_pval_threshold, added default explanation of that param to Execute_report.R, fixed some whitespace)
+- [#252](https://github.com/qbic-pipelines/rnadeseq/pull/252) Fixed github CI bug by updating actions/upload-artifact
+- [#250](https://github.com/qbic-pipelines/rnadeseq/pull/250) Fixed incorrect reading and indexing of contrast_pairs
+
 ## 2.4 - A Pair of Shoes
 
 ### Added
 
+- [#253](https://github.com/qbic-pipelines/rnadeseq/pull/253) Added separate param for adjusted p-value threshold for gprofiler
 - [#245](https://github.com/qbic-pipelines/rnadeseq/pull/245) Added background gene list to pathway analysis output
 
 ### Changed

diff --git a/Dockerfile b/Dockerfile
@@ -5,14 +5,14 @@ LABEL org.opencontainers.image.authors="Gisela Gabernet, Alexander Peltzer, Oska
 LABEL org.opencontainers.image.licenses=MIT
 COPY environment.yml /
 #RUN conda install -c conda-forge mamba
-RUN mamba env create --file /environment.yml -p /opt/conda/envs/qbic-pipelines-rnadeseq-2.4 && \
+RUN mamba env create --file /environment.yml -p /opt/conda/envs/qbic-pipelines-rnadeseq-2.5 && \
     mamba clean --all --yes
 RUN apt-get update -qq && \
     apt-get install -y zip procps ghostscript
 # Add conda installation dir to PATH
-ENV PATH /opt/conda/envs/qbic-pipelines-rnadeseq-2.4/bin:$PATH
+ENV PATH /opt/conda/envs/qbic-pipelines-rnadeseq-2.5/bin:$PATH
 # Dump the details of the installed packates to a file for posterity
-RUN mamba env export --name qbic-pipelines-rnadeseq-2.4 > qbic-pipelines-rnadeseq-2.4.yml
+RUN mamba env export --name qbic-pipelines-rnadeseq-2.5 > qbic-pipelines-rnadeseq-2.5.yml
 # Instruct R processes to use these empty files instead of clashing with a local config
 RUN touch .Rprofile
 RUN touch .Renviron
diff --git a/assets/RNAseq_report.Rmd b/assets/RNAseq_report.Rmd
@@ -46,6 +46,7 @@ params:
     datasources: ''
     heatmaps_cluster_rows: ''
     heatmaps_cluster_cols: ''
+    pathway_adj_pval_threshold: ''
 
     #Additional args for the report
     path_proj_summary: ''
@@ -456,7 +457,7 @@ if (params$input_type %in% c("featurecounts", "smrnaseq")) {
 
     # Write raw counts to file
     count_table_names <- merge(x=gene_names, y=count.table, by.x = "Ensembl_ID", by.y="row.names")
-    write.table(count_table_names, paste("differential_gene_expression/gene_counts_tables/raw_gene_counts.tsv",sep=""), append = FALSE, quote = FALSE, sep = "\t",eol = "\n", na = "NA", dec = ".", row.names = F, qmethod = c("escape", "double"))
+    write.table(count_table_names, paste("differential_gene_expression/gene_counts_tables/raw_gene_counts.tsv", sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = F, qmethod = c("escape", "double"))
 }
 
 # to get all possible pairwise comparisons, make a combined factor
@@ -591,6 +592,10 @@ if (params$input_type %in% c("featurecounts", "smrnaseq")) {
 
         #dds from SummarizedExperiment <se>, then run DESeq
         cds <- DESeqDataSet(se, design = as.formula(eval(parse(text=as.character(design[[1]])))))
+        # get raw counts
+        count_table_names <- merge(x=gene_names, y=assay(cds), by.x = "Ensembl_ID", by.y="row.names")
+        count_table_names <- count_table_names[order(count_table_names$Ensembl_ID),]
+        write.table(count_table_names, paste("differential_gene_expression/gene_counts_tables/raw_gene_counts.tsv", sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = F, qmethod = c("escape", "double"))
 
 # Load salmon count files
     } else if (params$input_type == "salmon") {
@@ -624,6 +629,10 @@ if (params$input_type %in% c("featurecounts", "smrnaseq")) {
         coldata$combfactor <- metadata$combfactor
         rownames(coldata) <- qbicCodes
         cds <- DESeqDataSetFromTximport(txi=txi.salmon, colData =coldata, design = eval(parse(text=as.character(design[[1]]))))
+        # get raw counts
+        count_table_names <- merge(x=gene_names, y=assay(cds), by.x = "Ensembl_ID", by.y="row.names")
+        count_table_names <- count_table_names[order(count_table_names$Ensembl_ID),]
+        write.table(count_table_names, paste("differential_gene_expression/gene_counts_tables/raw_gene_counts.tsv", sep=""), append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = F, qmethod = c("escape", "double"))
     }
 } else {
     stop(paste0("Invalid input type: ", params$input_type, "! Input type must be one of: featurecounts, rsem, salmon, smrnaseq!"))
@@ -1526,19 +1535,21 @@ if (isProvided(params$path_contrast_list)) {
         contrast_names <- append(contrast_names, contname)
     }
 }
-
 if (isProvided(params$path_contrast_pairs)) {
-    contrasts <- read.table(path_contrast_pairs, sep="\t", header = T, colClasses = "character")
+    contrasts <- read.table(params$path_contrast_pairs, sep="\t", header = T, colClasses = "character")
     write.table(contrasts, file="differential_gene_expression/metadata/contrast_pairs.tsv", sep="\t", quote=F, col.names = T, row.names = F)
 
     # Contrast calculation for contrast pairs
     for (i in c(1:nrow(contrasts))) {
         cont <- as.character(contrasts[i,])
-        contname <- cont[0]
-        if (!(cont[2] %in% coefficients & cont[3] %in% coefficients)){
-            stop(paste("Provided contrast name is invalid, it needs to be contained in", coefficients))
+        contname <- cont[1]
+        if (!(cont[2] %in% coefficients)){
+            stop(paste0("Provided contrast name ", cont[2], " is invalid, it needs to be contained in ", paste(coefficients, collapse=", ")))
+        }
+        if (!(cont[3] %in% coefficients)){
+            stop(paste0("Provided contrast name ", cont[3], " is invalid, it needs to be contained in ", paste(coefficients, collapse=", ")))
         }
-        results_DEseq_contrast <- results(cds, contrast=list(cont[1],cont[2]))
+        results_DEseq_contrast <- results(cds, contrast=list(cont[2],cont[3]))
         results_DEseq_contrast <- as.data.frame(results_DEseq_contrast)
         print("Analyzing contrast:")
         print(contname)
@@ -2056,6 +2067,7 @@ if (!isProvided(params$datasources)) {
 # ------------------
 # Set default params
 # ------------------
+pathway_pval_text <- format(params$pathway_adj_pval_threshold, scientific=F)
 
 # Set theme for graphs
 theme_set(theme_classic())
@@ -2094,7 +2106,7 @@ for (file in contrast_files){
             correction_method="fdr",
             sources=datasources,
             evcodes=TRUE,
-            user_threshold=params$adj_pval_threshold,
+            user_threshold=params$pathway_adj_pval_threshold,
             custom_bg=custom_background,
             domain_scope="custom_annotated"
         )
@@ -2110,7 +2122,7 @@ for (file in contrast_files){
             correction_method="fdr",
             sources=datasources,
             evcodes=TRUE,
-            user_threshold=params$adj_pval_threshold,
+            user_threshold=params$pathway_adj_pval_threshold,
             domain_scope="annotated"
         )
         pathway_gostres_nobg <- gostres_nobg$result
@@ -2124,7 +2136,7 @@ for (file in contrast_files){
             correction_method="fdr",
             sources=datasources,
             evcodes=TRUE,
-            user_threshold=params$adj_pval_threshold,
+            user_threshold=params$pathway_adj_pval_threshold,
             domain_scope="annotated"
         )
     }
@@ -2214,9 +2226,32 @@ for (file in contrast_files){
                 scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
                 ggtitle("Enriched pathways") +
                 xlab("") + ylab("Gene fraction (DE genes / Pathway size)")
-            ggsave(p, filename = paste0("pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.pdf"), device = "pdf", height = 5+0.5*nrow(df_subset), units = "cm", limitsize=F)
-            ggsave(p, filename = paste0("pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.png"), device = "png", height = 5+0.5*nrow(df_subset), units = "cm", limitsize=F)
-            ggsave(p, filename = paste0("pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.svg"), device = "svg", height = 5+0.5*nrow(df_subset), units = "cm", limitsize=F)
+
+            # If the plots are huge ggsave will throw an error even if limitsize=T, so I'm leaving limitsize=F and instead using trycatch
+            tryCatch(
+                {
+                    ggsave(p, filename = paste0("pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.pdf"), device = "pdf", height = 5+0.5*nrow(df_subset), units = "cm", limitsize=F)
+                },
+                error=function(e) {
+                    print(paste0("Could not save pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.pdf because of the following error:\n", e))
+                }
+            )
+            tryCatch(
+                {
+                    ggsave(p, filename = paste0("pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.png"), device = "png", height = 5+0.5*nrow(df_subset), units = "cm", limitsize=F)
+                },
+                error=function(e) {
+                    print(paste0("Could not save pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.png because of the following error:\n", e))
+                }
+            )
+            tryCatch(
+                {
+                    ggsave(p, filename = paste0("pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.svg"), device = "svg", height = 5+0.5*nrow(df_subset), units = "cm", limitsize=F)
+                },
+                error=function(e) {
+                    print(paste0("Could not save pathway_analysis", "/", fname, "/enrichment_plots/", make.names(db_source), "_pathway_enrichment_plot.svg because of the following error:\n", e))
+                }
+            )
 
             # Plotting heatmaps and KEGG pathways for all pathways
             print("Plotting heatmaps...")
@@ -2399,7 +2434,7 @@ Inside the pathway analysis results folder, a subfolder for each contrast used f
 - `*_gost_pathway_venn_diagram.pdf/png`
     - Venn diagrams showing the numbers of enriched pathways when using a background gene list vs when not using a bg list.
 - `enrichment_plots/*_pathway_enrichment_plot.{pdf/png/svg}`
-    - Barplots showing the proportion of differentially expressed genes in the pathway for a certain pathway database.
+    - Barplots showing the proportion of differentially expressed genes in the pathway for a certain pathway database (might be missing if too many pathways were enriched for fitting into a plot).
 - `gost_pathway_gostplot.{pdf/png/svg}`
     - Manhattan plots displaying all enriched pathways.
 - `KEGG_pathways/`
@@ -2411,7 +2446,7 @@ gost_text,
 
 "\n
 ## Enriched pathways
-The plot below summarizes the pathways that were found significantly enriched in DE genes for each contrast (padj value <= ", pval_text, ").
+The plot below summarizes the pathways that were found significantly enriched in DE genes for each contrast (padj value <= ", pathway_pval_text, ").
 Only contrasts for which an enriched pathway was found are shown.
 Hover over the dots to reveal the pathway names. The table below provides more detail on all enriched pathways."))
 ```
@@ -2447,7 +2482,7 @@ if (length(q_list) > 0) {
                     significant=T,
                     correction_method="fdr",
                     sources=datasources,
-                    user_threshold=params$adj_pval_threshold,
+                    user_threshold=params$pathway_adj_pval_threshold,
                     custom_bg=custom_background,
                     domain_scope="custom_annotated"
         )
@@ -2457,7 +2492,7 @@ if (length(q_list) > 0) {
                     significant=T,
                     correction_method="fdr",
                     sources=datasources,
-                    user_threshold=params$adj_pval_threshold,
+                    user_threshold=params$pathway_adj_pval_threshold,
                     domain_scope="annotated"
         )
     }
@@ -2470,7 +2505,7 @@ if (length(q_list) > 0) {
 
     if (nrow(path_enrich) > 0){
         pg2 <- gostplot(gostres, capped=T, interactive=T)
-        pg2[['x']][['layout']][['annotations']][[1]][['x']] <- -params$adj_pval_threshold
+        pg2[['x']][['layout']][['annotations']][[1]][['x']] <- -params$pathway_adj_pval_threshold
 
         # limit gostplot y maximum dynamically for all subplots
         for (counter in c(1:length(contrast_files))) {
@@ -2691,7 +2726,7 @@ For pathway analysis, the R packages `gprofiler2 v", version_gprofiler2,
 " `, `AnnotationDbi v", version_annotation,
 "` and `", name_species, " v", version_annotation,
 "` were used. ", database_string, ".\n",
-"Pathways were classified as enriched for those genes with an adjusted p-value <= ", pval_text, "."
+"Pathways were classified as enriched for those genes with an adjusted p-value <= ", pathway_pval_text, "."
 ))
 ```
 

diff --git a/bin/Execute_report.R b/bin/Execute_report.R
@@ -37,6 +37,7 @@ option_list = list(
     make_option("--datasources", type="character", default=NULL, help="Which datasources to use for pathway analysis.", metavar="character"),
     make_option("--heatmaps_cluster_rows", action="store_true", default=FALSE, help="Whether to activate row clustering when generating heatmaps of gene expression in enriched pathways."),
     make_option("--heatmaps_cluster_cols", action="store_true", default=FALSE, help="Whether to activate column clustering when generating heatmaps of gene expression in enriched pathways."),
+    make_option("--pathway_adj_pval_threshold", type="double", default=-1, help="Which adjusted p value threshold to use for pathway analysis. Will by default use the same value as the value of --adj_pval_threshold (default 0.05)."),
 
     make_option(c("-s", "--proj_summary"), type="character", default=NULL, help="Project summary file", metavar="character"),
     make_option(c("--path_quote"), type="character", default=NULL, help="Path to the quote PDF", metavar="character"),
@@ -89,6 +90,7 @@ rmarkdown::render(opt$report, output_file = opt$output, knit_root_dir = wd, outp
                                 datasources = opt$datasources,
                                 heatmaps_cluster_rows = opt$heatmaps_cluster_rows,
                                 heatmaps_cluster_cols = opt$heatmaps_cluster_cols,
+                                pathway_adj_pval_threshold = opt$pathway_adj_pval_threshold,
 
                                 path_proj_summary = opt$proj_summary,
                                 path_quote = opt$path_quote,

diff --git a/conf/test.config b/conf/test.config
@@ -28,6 +28,7 @@ params {
     software_versions = 'https://raw.githubusercontent.com/qbic-pipelines/rnadeseq/dev/testdata/software_versions.csv'
     multiqc = 'https://raw.githubusercontent.com/qbic-pipelines/rnadeseq/dev/testdata/MultiQC.zip'
     run_pathway_analysis = true
+    pathway_adj_pval_threshold = 0.0004
     datasources = 'KEGG,REAC'
     genome = 'GRCm38'
     quote = 'https://raw.githubusercontent.com/qbic-pipelines/rnadeseq/dev/testdata/offer_example.pdf'