Merge remote-tracking branch 'refs/remotes/origin/main'

LUMC · Dec 18, 2024 · 5771894 · 5771894
2 parents d697024 + ba2eb7b
commit 5771894
Show file tree

Hide file tree

Showing 44 changed files with 1,632 additions and 192 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -11,6 +11,29 @@ Changelog
 v2.2.1-dev
 **********
 
+Breaking changes
+================
+* The `bed_variant_call_regions` option has been removed; variants are now
+  called for all genes present in the `gtf` file.
+* Add graphviz/`dot` as a dependency (developer only).
+
+Novel module
+============
+* Add novel module, **expression**, which analyzes gene expression.
+  * Add optional input `strandedness` to the sample configuration.
+  * Add json output file for the expression module.
+
+Bugfixes
+========
+* Fix a rare bug where different modules use the same multiqc file list.
+* Fix a bug with filtering VEP records that contain multiple population
+  frequency records for a single variant.
+
+Updates
+=======
+* Add ability to generate configurations for each module using the
+  `utilities/create-config.py` script.
+
 **********
 v2.1.3
 **********

diff --git a/Snakefile b/Snakefile
@@ -28,6 +28,7 @@ config["qc-seq"]["pepfile"] = config["pepfile"]
 config["snv-indels"]["pepfile"] = config["pepfile"]
 config["itd"]["pepfile"] = config["pepfile"]
 config["fusion"]["pepfile"] = config["pepfile"]
+config["expression"]["pepfile"] = config["pepfile"]
 
 
 # Define HAMLET modules
@@ -122,13 +123,35 @@ use rule plot_fusions from fusion as fusion_plot_fusions with:
         fusion.containers["arriba"]
 
 
+module expression:
+    snakefile:
+        "includes/expression/Snakefile"
+    config:
+        config["expression"]
+
+
+use rule * from expression as expression_*
+
+
+# Connect the output of snv-indels to expression
+use rule normalized_coverage from expression as expression_normalized_coverage with:
+    input:
+        bam=align.module_output.bam,
+        bai=align.module_output.bai,
+        counts=align.module_output.counts,
+        gtf=config["expression"]["gtf"],
+        bed=config["expression"].get("bed", []),
+        src=srcdir("includes/expression/scripts/coverage.py"),
+
+
 rule create_summary:
     """Combines statistics and other info across modules to a single JSON file per sample."""
     input:
         idm=config["snv-indels"]["ref_id_mapping"],
         fusion_json=fusion.module_output.json,
         snv_indels_json=align.module_output.json,
         itd_json=itd.module_output.json,
+        expression_json=expression.module_output.json,
         scr=srcdir("scripts/create_summary.py"),
     params:
         pipeline_ver=PIPELINE_VERSION,
@@ -146,6 +169,7 @@ rule create_summary:
             --sample-name {wildcards.sample} \
             --module {input.fusion_json} \
             --module {input.snv_indels_json} \
+            --module {input.expression_json} \
             --module {input.itd_json} > {output.js} 2>{log}
         """
 
@@ -208,11 +232,12 @@ rule multiqc:
     input:
         qc_stats=qc_seq.module_output.multiqc_files,
         snv_indel_stats=align.module_output.multiqc_files,
+        expression_stats=expression.module_output.multiqc_files,
         config=srcdir("cfg/multiqc.yml"),
     params:
         filelist="multiqc_filelist.txt",
         depth=2,
-        modules=multiqc_modules(),
+        exclude="dedup",
     output:
         html="multiqc_hamlet.html",
     log:
@@ -223,7 +248,7 @@ rule multiqc:
         """
         rm -f {params.filelist}
 
-        for fname in {input.qc_stats} {input.snv_indel_stats}; do
+        for fname in {input.qc_stats} {input.snv_indel_stats} {input.expression_stats}; do
             echo $fname >> {params.filelist}
         done
 
@@ -235,6 +260,6 @@ rule multiqc:
         --fn_as_s_name \
         --file-list {params.filelist} \
         --config {input.config} \
-        {params.modules} \
+        --exclude {params.exclude} \
         --filename {output.html} 2> {log}
         """
diff --git a/cfg/multiqc.yml b/cfg/multiqc.yml
@@ -6,7 +6,7 @@ report_header_info:
   - Pipeline: "https://github.com/LUMC/HAMLET"
   - Platform: "Illumina RNAseq data"
 
-
+violin_min_threshold_outliers: 200
 show_analysis_paths: False
 show_analysis_time: False
 
@@ -35,6 +35,15 @@ report_section_order:
   picard:
     order: 700
 
+remove_sections:
+  - sequali_sequence_duplication_levels
+  - sequali_top_overrepresented_sequences_read1
+  - sequali_top_overrepresented_sequences_read2
+  - sequali_adapter_content_from_overlap
+  - sift_summary
+  - polyphen_summary
+  - position_in_protein
+
 sample_names_replace_regex: True
 sample_names_replace:
   "(.+) \\| qc-seq \\| (.+).cutadapt.json": "\\1"

diff --git a/common.smk b/common.smk
@@ -18,9 +18,3 @@ containers = {
 
 # The version of HAMLET
 PIPELINE_VERSION = "v2.2.1-dev"
-
-
-def multiqc_modules():
-    """Define which MultiQC modules to run here"""
-    modules = ["cutadapt", "sequali", "star", "picard", "vep"]
-    return [f" --module {module}" for module in modules]
diff --git a/docs/source/expression.rst b/docs/source/expression.rst
@@ -2,11 +2,12 @@ expression
 ==========
 
 The `expression` module is responsible for determining gene expression levels
-from STAR bam and count files. Although the strandedness of the library
-preparation is important when determining, the module itself is strand
-agnostic. Instead, we take inspiration from STAR and produce output files for
-unstranded, forward stranded and reverse stranded libraries, and leave it to
-the user to select the relevant output for their samples.
+from STAR bam and count files. For the highest accuracy the strandedness of the
+library preparation can be specified. By default, the module assumes the data
+is unstranded.
+
+The expression of a configurable set of housekeeping genes is used to normalize
+the expression of the genes of interest.
 
 Tools
 -----
@@ -15,42 +16,73 @@ housekeeping genes to normalize gene expression levels.
 
 Input
 -----
-The input for this module is one BAM file and one STAR count table
+The minimal input for this module is one BAM file and one STAR count table
 specified in a PEP configuration file, as is shown below.
 
-.. csv-table:: Example input for the expression module
+.. csv-table:: Minimal input for the expression module
    :delim: ,
    :file: ../../test/pep/expression.csv
 
+For more accurate results, it is possible to specify the strandedness of your
+RNA library prep (`unstranded`, `forward` or `reverse`). If strandedness is
+not specified, all samples will be treated as unstranded.
+
+.. csv-table:: Sample configuration with strandedness
+   :delim: ,
+   :file: ../../test/pep/expression_strandedness.csv
+
 Output
 ------
 
-* Three files with the normalized gene expression levels, one for each strandedness.
-* A single MultiQC report which contains the same data.
+* The genes specified under the `report` section of the configuration will be
+  included in the HAMLET pdf report.
+* All genes from the `bed` file and the `genes_of_interest` will be included in
+  the MultiQC report. Stranded (forward and reverse) and unstranded samples
+  will be listed separately, since their values cannot be compared directly.
 
 Configuration
 -------------
 The following options are available for the `expression` module
 
 
-Example
-^^^^^^^
-.. literalinclude:: ../../test/data/config/expression.json
-   :language: json
-
 Configuration options
 ^^^^^^^^^^^^^^^^^^^^^
 .. list-table:: Configuration options
 
   * - Option
     - Description
     - Required
-  * - housekeeping
-    - A list of genes to use for normalizing the expression
-    - yes
   * - gtf
     - A GTF file, to look up the ENSG for the housekeeping genes
     - yes
+  * - housekeeping
+    - A list of genes to use for normalizing the expression
+    - yes
   * - bed
     - A BED file with genomic regions (genes) to quantify
-    - yes
+    - no
+  * - genes_of_interest
+    - A list of gene names to quantify (must be present in the gtf)
+    - no
+  * - report
+    - Genes to include in the PDF report, can include names from the bed file
+    - no
+
+Example
+^^^^^^^
+.. code-block:: json
+
+  {
+    "gtf": "test/data/reference/hamlet-ref.gtf",
+    "housekeeping": [
+      "MT-CO2"
+    ],
+    "bed": "path/to/bed/file.bed",
+    "genes_of_interest": [
+      "MT-ND3",
+      "MT-ND2"
+    ],
+    "report": [
+      "MT-ND3"
+    ]
+  }
diff --git a/docs/source/fusion.rst b/docs/source/fusion.rst
@@ -27,6 +27,7 @@ The output of this module are a JSON file with an overview of the most important
 
 Configuration
 -------------
+You can automatically generate a configuration for the fusion module using the `utilities/create-config.py` script.
 
 Example
 ^^^^^^^

diff --git a/docs/source/itd.rst b/docs/source/itd.rst
@@ -25,6 +25,7 @@ The output of this module are a JSON file with an overview of the most important
 Configuration
 -------------
 The configuration for this module is tailored to the provided reference files, be very careful if you want to modify any of these settings.
+You can automatically generate a configuration for the itd module using the `utilities/create-config.py` script.
 
 .. literalinclude:: ../../test/data/config/itd.json
    :language: json

diff --git a/docs/source/qc-seq.rst b/docs/source/qc-seq.rst
@@ -25,6 +25,8 @@ The output of this module are one set of merged FastQ files per sample, as well
 
 Configuration
 -------------
+You can automatically generate a configuration for the qc-seq module using the `utilities/create-config.py` script.
+
 Example
 ^^^^^^^
 .. literalinclude:: ../../test/data/config/qc-seq.json

diff --git a/docs/source/snv-indels.rst b/docs/source/snv-indels.rst
@@ -36,6 +36,7 @@ The output of this module are a JSON file with an overview of the most important
 
 Configuration
 -------------
+You can automatically generate a configuration for the snv-indels module using the `utilities/create-config.py` script.
 
 Example
 ^^^^^^^
@@ -81,9 +82,6 @@ Configuration options
   * - bed_variant_hotspots
     - BED file of hotspot regions
     - yes
-  * - bed_variant_call_regions
-    - BED file of regions to call variants
-    - yes
   * - gtf
     - GTF file with transcripts, used by STAR
     - yes

diff --git a/environment.yml b/environment.yml
@@ -15,3 +15,4 @@ dependencies:
   - sphinx=7.1.2
   - sphinx-rtd-theme
   - pysam=0.22
+  - graphviz
-Original file line number
+Diff line change
@@ Expand Up @@
     Configuration
     -------------
+    You can automatically generate a configuration for the fusion module using the `utilities/create-config.py` script.
     Example
     ^^^^^^^
@@ Expand Down @@