improve report and document software versions

epigen · Feb 21, 2024 · e974d6f · e974d6f
1 parent 0aa2ad6
commit e974d6f
Show file tree

Hide file tree

Showing 21 changed files with 186 additions and 197 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,9 +1,7 @@
 
-# alwayse use absolute paths
-
 ##### RESOURCES #####
 mem: '32000'
-threads: 2
+threads: 1 # only DEA rule is multicore and gets 8*threads
 partition: 'shortq'
 
 ##### GENERAL #####
@@ -42,3 +40,11 @@ filters:
 volcano:
     pCutoff: 0.05
     FCcutoff: 0.1
+
+# path(s) to feature lists as plain text files (.txt) with one gene per line.
+# used to highlight features in volcano plots and generate LFC clustered heatmaps.
+# only use camelCase for the feature_list names like in the examples below.
+# if not used: put an empty entry e.g., noGenes: ""
+feature_lists:
+    favoriteGenes: ""
+    interestingGenes: ""
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -25,10 +25,10 @@ annot_dict = annot.to_dict('index')
 
 # load feature list paths and keep only non-empty
 feature_lists = []
-# feature_lists_dict = config["feature_lists"]
-# feature_lists_dict = {k: v for k, v in feature_lists_dict.items() if v!=""}
-# if feature_lists_dict is not None:
-#     feature_lists = feature_lists + list(feature_lists_dict.keys())
+feature_lists_dict = config["feature_lists"]
+feature_lists_dict = {k: v for k, v in feature_lists_dict.items() if v!=""}
+if feature_lists_dict is not None:
+    feature_lists = feature_lists + list(feature_lists_dict.keys())
 
 result_path = os.path.join(config["result_path"], module_name)
 
@@ -44,14 +44,14 @@ rule all:
         stats = expand(os.path.join(result_path,'{analysis}','stats.csv'),
                                analysis = analyses,
                               ),
-#         heatmaps = expand(os.path.join(result_path,'{analysis}','plots','heatmap','{feature_list}.png'),
-#                                  analysis = analyses,
-#                                  feature_list = feature_lists + ['FILTERED'],
-#                               ),
-        envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['seurat','volcanos','ggplot','heatmap']),
+        heatmaps = expand(os.path.join(result_path,'{analysis}','plots','heatmap','{feature_list}.png'),
+                                 analysis = analyses,
+                                 feature_list = feature_lists + ['FILTERED'],
+                         ),
+        envs = expand(os.path.join(config["result_path"],'envs',module_name,'{env}.yaml'),env=['seurat','visualization']), # 'volcanos','ggplot','heatmap'
         configs = os.path.join(config["result_path"],'configs',module_name,'{}_config.yaml'.format(config["project_name"])),
         annotations = os.path.join(config["result_path"],'configs',module_name,'{}_annot.csv'.format(config["project_name"])),
-#         feature_lists = expand(os.path.join(config["result_path"],'configs','dea_seurat','{feature_list}.txt'), feature_list = feature_lists),
+        feature_lists = expand(os.path.join(config["result_path"],'configs','dea_seurat','{feature_list}.txt'), feature_list = feature_lists),
     resources:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 1)

diff --git a/workflow/dags/rulegraph.svg b/workflow/dags/rulegraph.svg
diff --git a/workflow/envs/ggplot.yaml b/workflow/envs/ggplot.yaml
diff --git a/workflow/envs/heatmap.yaml b/workflow/envs/heatmap.yaml
diff --git a/workflow/envs/seurat.yaml b/workflow/envs/seurat.yaml
@@ -4,5 +4,6 @@ channels:
   - defaults
 dependencies:
   - r-seurat=4.4.0
-  - bioconductor-limma
-  - r-data.table
+  - bioconductor-limma=3.58.1
+  - r-data.table=1.14.10
+  - r-future=1.33.1
diff --git a/workflow/envs/visualization.yaml b/workflow/envs/visualization.yaml
@@ -0,0 +1,11 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconductor-enhancedvolcano=1.12.0
+  - r-ggplot2=3.4.2
+  - r-ggplotify=0.1.0
+  - r-pheatmap=1.0.12
+  - r-data.table=1.14.8
+  - r-reshape2=1.4.4
diff --git a/workflow/envs/volcanos.yaml b/workflow/envs/volcanos.yaml
diff --git a/workflow/report/dea_stats.rst b/workflow/report/dea_stats.rst
@@ -1 +1 @@
-Statistics and visualizations of ALL and FILTERED DEA result statistics of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
+Statistics and visualizations of filtered DEA results of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
diff --git a/workflow/report/heatmap.rst b/workflow/report/heatmap.rst
@@ -0,0 +1 @@
+Clustered heatmap visualizing average log2 fold changes of {{snakemake.wildcards["feature_list"]}} features resulting from DEA {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
diff --git a/workflow/report/lfc_heatmap.rst b/workflow/report/lfc_heatmap.rst
diff --git a/workflow/report/volcano.rst b/workflow/report/volcano.rst
@@ -1 +1 @@
-Volcano plot panel visualizing DEA results of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
+Volcano plot panel visualizing DEA results of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}} highlighting {{snakemake.wildcards["feature_list"]}} features.
diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst
@@ -1 +1 @@
-A `Snakemake workflow for performing differential expression analyses (DEA) <https://github.com/epigen/dea_seurat>`_ on (multimodal) sc/snRNA-seq data powered by the R package `Seurat <https://satijalab.org/seurat/index.html>`_.
+A `Snakemake workflow for performing differential expression analyses (DEA) <https://github.com/epigen/dea_seurat>`_ on (multimodal) scRNA-seq data powered by the R package `Seurat <https://satijalab.org/seurat/index.html>`_.
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -2,3 +2,6 @@
 
 def get_data_path(wildcards):
     return annot.loc[wildcards.analysis,'data']
+
+def get_feature_list_path(wildcards):
+    return config["feature_lists"][wildcards.feature_list]
diff --git a/workflow/rules/dea.smk b/workflow/rules/dea.smk
@@ -8,7 +8,7 @@ rule dea:
         all_features = os.path.join(result_path,'{analysis}','feature_lists',"ALL_features.txt"),
     resources:
         mem_mb=config.get("mem", "16000"),
-    threads: config.get("threads", 1)
+    threads: 8*config.get("threads", 1)
     conda:
         "../envs/seurat.yaml"
     log:
@@ -35,29 +35,20 @@ rule aggregate:
                                       "type": "table",
                                       "misc": "CSV",
                                   }),
-#         dea_filtered_stats = report(os.path.join(result_path,'{analysis}','DEA_FILTERED_stats.csv'), 
-#                                   caption="../report/dea_stats.rst", 
-#                                   category="{}_dea_seurat".format(config["project_name"]), 
-#                                   subcategory="{analysis}"),
-#         dea_filtered_lfc = os.path.join(result_path,'{analysis}','DEA_FILTERED_LFC.csv'),
         dea_stats_plot = report(os.path.join(result_path,'{analysis}','plots','stats.png'), 
                                   caption="../report/dea_stats.rst", 
                                   category="{}_{}".format(config["project_name"], module_name),
                                   subcategory="{analysis}",
                                   labels={
                                       "name": "Statistics",
-                                      "type": "stacked bar plot",
+                                      "type": "Bar plot",
                                       "misc": "PNG",
                                   }),
-#         dea_filtered_stats_plot = report(os.path.join(result_path,'{analysis}','plots','DEA_FILTERED_stats.png'), 
-#                                   caption="../report/dea_stats.rst", 
-#                                   category="{}_dea_seurat".format(config["project_name"]), 
-#                                   subcategory="{analysis}"),
     resources:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 1)
     conda:
-        "../envs/ggplot.yaml"
+        "../envs/visualization.yaml"
     log:
         os.path.join("logs","rules","aggregate_{analysis}.log"),
     params:

diff --git a/workflow/rules/envs_export.smk b/workflow/rules/envs_export.smk
@@ -60,3 +60,25 @@ rule annot_export:
         """
         cp {input} {output}
         """
+
+# export used gene lists for documentation and reproducibility
+rule feature_list_export:
+    input:
+        get_feature_list_path,
+    output:
+        feature_lists = report(os.path.join(config["result_path"],'configs',module_name,'{feature_list}.txt'), 
+                            caption="../report/feature_lists.rst", 
+                            category="Configuration", 
+                            subcategory="{}_{}".format(config["project_name"], module_name)
+                           ),
+    resources:
+        mem_mb=1000, #config.get("mem_small", "16000"),config.get("mem", "16000"),
+    threads: config.get("threads", 1)
+    log:
+        os.path.join("logs","rules","feature_list_export_{feature_list}.log"),
+    params:
+        partition=config.get("partition"),
+    shell:
+        """
+        cp {input} {output}
+        """
diff --git a/workflow/rules/visualize.smk b/workflow/rules/visualize.smk
@@ -18,7 +18,7 @@ rule volcanos:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 1)
     conda:
-        "../envs/volcanos.yaml"
+        "../envs/visualization.yaml"
     log:
         os.path.join("logs","rules","volcanos_{analysis}_{feature_list}.log"),
     params:
@@ -33,22 +33,21 @@ rule volcanos:
 rule heatmap:
     input:
         results = os.path.join(result_path,'{analysis}','results.csv'),
-#         dea_filtered_lfc = os.path.join(result_path,'{analysis}','DEA_FILTERED_LFC.csv'),
     output:
         lfc_heatmap = report(os.path.join(result_path,'{analysis}','plots','heatmap','{feature_list}.png'),
-                              caption="../report/lfc_heatmap.rst",
+                              caption="../report/heatmap.rst",
                               category="{}_{}".format(config["project_name"], module_name),
                               subcategory="{analysis}",
                               labels={
                                       "name": "Heatmap",
-                                      "type": "effect sizes",
+                                      "type": "Effect sizes",
                                       "misc": "{feature_list}",
                                   }),
     resources:
         mem_mb=config.get("mem", "16000"),
     threads: config.get("threads", 1)
     conda:
-        "../envs/heatmap.yaml"
+        "../envs/visualization.yaml"
     log:
         os.path.join("logs","rules","lfc_heatmap_{analysis}_{feature_list}.log"),
     params:

diff --git a/workflow/scripts/aggregate.R b/workflow/scripts/aggregate.R
@@ -143,9 +143,10 @@ dea_filtered_results_p <- ggplot(plot_stats_df, aes(x=groups, y=n_features, fill
                                              geom_bar(stat="identity", position="identity") +
                                              xlab(metadata) +
                                              ylab("number of differential features") +
-                                             scale_fill_manual(values=list("up"="red", "down"="blue"), drop=FALSE) +
+                                             scale_fill_manual(values=list("down"="blue", "up"="red"), drop=FALSE) +
+                                             scale_y_continuous(labels = function(y) sapply(y, function(y) ifelse(y < 0, paste0(sub("-", "", as.character(y))), y))) +
                                              custom_theme +
-                                             theme(legend.position = "none",
+                                             theme(#legend.position = "none",
                                                    axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size = 6)
                                                   )
 

diff --git a/workflow/scripts/dea.R b/workflow/scripts/dea.R
@@ -1,6 +1,13 @@
 #### load libraries & utility function 
 library("Seurat")
 library("data.table")
+library("future")
+
+# change the current plan to access all available cores for parallelization
+plan("multicore") # "multisession" does not work
+plan()
+# set global size for parallelization -> TODO: dynamic depending on data size, RAM and available threads. how?
+options(future.globals.maxSize = 1000 * 1024^2)
 
 # inputs
 object_path <- snakemake@input[[1]]

diff --git a/workflow/scripts/heatmap.R b/workflow/scripts/heatmap.R
@@ -11,7 +11,6 @@ snakemake@source("./utils.R")
 
 # inputs
 dea_result_path <- snakemake@input[["results"]]
-# dea_filtered_lfc_path <- snakemake@input[["dea_filtered_lfc"]]
 
 # outputs
 lfc_heatmap_path <- snakemake@output[["lfc_heatmap"]]
@@ -28,14 +27,9 @@ min_pct <- snakemake@config[["filters"]][["min_pct"]]
 feature_list_name <- snakemake@wildcards[["feature_list"]]
 
 # plot specifications
-width <- 0.25
+width <- 0.15
 height <- 0.15
 
-
-### load LFC DEA results
-# dea_lfc <- read.csv(file=file.path(dea_filtered_lfc_path), row.names = 1)
-# dea_lfc <- data.frame(fread(file.path(dea_filtered_lfc_path), header=TRUE), row.names=1)
-
 # load dea results
 dea_results <- data.frame(fread(file.path(dea_result_path), header=TRUE))
 
@@ -47,6 +41,8 @@ if (feature_list_name=="FILTERED"){
 } else {
     feature_list_path <- snakemake@config[["feature_lists"]][[feature_list_name]]
     feature_list <- scan(file.path(feature_list_path), character())
+    # ensure features are in the results
+    feature_list <- unique(intersect(feature_list, dea_results$feature))
 }
 
 # make LFC dataframe
@@ -69,6 +65,7 @@ lfc_df[is.na(lfc_df)] <- 0
 # indicate significance
 adjp_df[adjp_df<=adj_pval] <- "*"
 adjp_df[adjp_df>adj_pval] <- ""
+adjp_df[is.na(adjp_df)] <- ""
 
 ### visualize LFC of DEA results as heatmap
 height_panel <-  if (nrow(lfc_df)<100) (height * nrow(lfc_df) + 2) else 5
@@ -77,7 +74,7 @@ width_panel <- width * ncol(lfc_df) + 2
 # make heatmap
 lfc_heatmap <- as.ggplot(pheatmap(lfc_df,
                                   display_numbers = if(nrow(lfc_df)<100) adjp_df else FALSE,
-                                  main=paste0("Average log2FC of ", feature_list_name," features"),
+                                  main=paste0("Avg.log2FC of ", feature_list_name," features\n", metadata,' vs ', control),
                                   cluster_cols = ifelse(ncol(lfc_df)>1, TRUE, FALSE),
                                   cluster_rows = ifelse(nrow(lfc_df)>1, TRUE, FALSE),
                                   show_rownames = ifelse(nrow(lfc_df)<100, TRUE, FALSE),
@@ -96,48 +93,9 @@ lfc_heatmap <- as.ggplot(pheatmap(lfc_df,
                                   silent = TRUE
                                  )
                         )
-
+# save heatmap
 ggsave_new(filename = feature_list_name, 
            results_path=dirname(lfc_heatmap_path), 
            plot=lfc_heatmap, 
            width=width_panel, 
            height=height_panel)
-
-
-# ########### OLD CODE
-
-# # set NA values to 0 (NA because below LFC threshold during testing or filtering)
-# dea_lfc[is.na(dea_lfc)] <- 0
-
-# ### visualize LFC of DEA results as heatmap
-# width_panel <- width * ncol(dea_lfc) + 3
-
-# annot <- data.frame(group=gsub("group_","",colnames(dea_lfc)))
-# rownames(annot) <- colnames(dea_lfc)
-
-# # make heatmap
-# lfc_heatmap <- as.ggplot(pheatmap(dea_lfc,
-#                                   show_rownames=F,
-#                                   show_colnames=T,
-#                                   cluster_cols = ncol(dea_lfc) > 1,
-#                                   fontsize = 5,
-#                                   angle_col = 45,
-#                                   treeheight_row = 25,
-#                                   treeheight_col = 10,
-# #                                   annotation_col = annot,
-#                                   breaks=seq(-max(abs(dea_lfc)), max(abs(dea_lfc)), length.out=200),
-#                                   color=colorRampPalette(c("blue", "white", "red"))(200),
-#                                   annotation_names_col = F,
-#                                   silent = TRUE
-#                                  )
-#                         )
-
-# # save plot
-# # options(repr.plot.width=width_panel, repr.plot.height=height)
-# # print(lfc_heatmap)
-
-# ggsave_new(filename = feature_list_name, 
-#            results_path=dirname(lfc_heatmap_path), 
-#            plot=lfc_heatmap, 
-#            width=width_panel, 
-#            height=height)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		Statistics and visualizations of ALL and FILTERED DEA result statistics of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
		Statistics and visualizations of filtered DEA results of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Clustered heatmap visualizing average log2 fold changes of {{snakemake.wildcards["feature_list"]}} features resulting from DEA {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		Volcano plot panel visualizing DEA results of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}}.
		Volcano plot panel visualizing DEA results of analysis {{snakemake.wildcards["analysis"]}} using assay {{snakemake.params["assay"]}} to compare groups in {{snakemake.params["metadata"]}} versus {{snakemake.params["control"]}} highlighting {{snakemake.wildcards["feature_list"]}} features.
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		A `Snakemake workflow for performing differential expression analyses (DEA) <https://github.com/epigen/dea_seurat>`_ on (multimodal) sc/snRNA-seq data powered by the R package `Seurat <https://satijalab.org/seurat/index.html>`_.
		A `Snakemake workflow for performing differential expression analyses (DEA) <https://github.com/epigen/dea_seurat>`_ on (multimodal) scRNA-seq data powered by the R package `Seurat <https://satijalab.org/seurat/index.html>`_.