Merge MultiQC results by strandedness

LUMC · Dec 5, 2024 · 99e0a38 · 99e0a38
1 parent 16e6656
commit 99e0a38
Show file tree

Hide file tree

Showing 3 changed files with 61 additions and 39 deletions.
diff --git a/includes/expression/Snakefile b/includes/expression/Snakefile
@@ -80,6 +80,7 @@ rule merge_samples:
         src=srcdir("scripts/multiqc.py"),
     params:
         samples=[sample.sample for sample in samples],
+        strandedness=[get_strand(sample) for sample in samples],
     output:
         unstranded="merged_expression_unstranded_mqc.tsv",
         forward="merged_expression_forward_mqc.tsv",
@@ -94,20 +95,8 @@ rule merge_samples:
         python3 {input.src} \
             --counts {input.counts} \
             --samples {params.samples} \
-            --strand unstranded \
-            > {output.unstranded} 2> {log}
-
-        python3 {input.src} \
-            --counts {input.counts} \
-            --samples {params.samples} \
-            --strand forward \
-            > {output.forward} 2>> {log}
-
-        python3 {input.src} \
-            --counts {input.counts} \
-            --samples {params.samples} \
-            --strand reverse \
-            > {output.reverse_} 2>> {log}
+            --strandedness {params.strandedness}\
+            2> {log}
         """
 
 

diff --git a/includes/expression/scripts/multiqc.py b/includes/expression/scripts/multiqc.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import argparse
+from collections import defaultdict
 
 
 def read_expression(fname, strand):
@@ -12,31 +13,52 @@ def read_expression(fname, strand):
             yield spline[0], spline[column]
 
 
-def main(samples, countfiles, strand):
-    # Did we already write the gene header?
-    gene_header = None
+def main(samples, countfiles, strandedness):
+    # Group the samples by strand
+    grouped = defaultdict(list)
+    for sample, fname, strand in zip(samples, countfiles, strandedness):
+        grouped[strand].append((sample, fname))
 
-    # First, we print the multiqc header
-    multiqc_header = f"""# plot_type: "table"
+    for strand in ["unstranded", "forward", "reverse"]:
+        samples = [x[0] for x in grouped[strand]]
+        fnames = [x[1] for x in grouped[strand]]
+
+        write_multiqc(samples, fnames, strand)
+
+
+def write_multiqc(samples, countfiles, strand):
+    outfile = f"merged_expression_{strand}_mqc.tsv"
+
+    with open(outfile, "wt") as fout:
+        # If there are no samples, we don't even write the header
+        if not samples:
+            return
+        # Did we already write the gene header?
+        gene_header = None
+
+        # First, we print the multiqc header
+        multiqc_header = f"""# plot_type: "table"
 # id: mqc_expression_{strand}
 # section_name: "{strand.capitalize()}"
 # description: "Normalized gene expression assuming <b>{strand} library preparation</b>." """
 
-    print(multiqc_header)
+        print(multiqc_header, file=fout)
 
-    for sample, fname in zip(samples, countfiles):
-        # Store the normalized expression for each gene per strand
-        expression = dict()
-        for values in read_expression(fname, args.strand):
-            gene, level = values
-            expression[gene] = level
+        for sample, fname in zip(samples, countfiles):
+            # Store the normalized expression for each gene per strand
+            expression = dict()
+            for values in read_expression(fname, strand):
+                gene, level = values
+                expression[gene] = level
 
-        # IF this is the first time, write the header
-        if not gene_header:
-            gene_header = list(expression.keys())
-            print("Sample", *gene_header, sep="\t")
+            # IF this is the first time, write the header
+            if not gene_header:
+                gene_header = list(expression.keys())
+                print("Sample", *gene_header, sep="\t", file=fout)
 
-        print(sample, *(expression[gene] for gene in gene_header), sep="\t")
+            print(
+                sample, *(expression[gene] for gene in gene_header), sep="\t", file=fout
+            )
 
 
 if __name__ == "__main__":
@@ -50,13 +72,20 @@ def main(samples, countfiles, strand):
         help="Files with normalized expression levels",
     )
     parser.add_argument(
-        "--strand", required=True, choices=["unstranded", "forward", "reverse"]
+        "--strandedness",
+        required=True,
+        nargs="+",
+        choices=["unstranded", "forward", "reverse"],
     )
 
     args = parser.parse_args()
 
-    if len(args.samples) != len(args.counts):
-        msg = "--counts and --samples must have the same number of items"
+    if len(args.samples) != len(args.counts) or len(args.samples) != len(
+        args.strandedness
+    ):
+        msg = (
+            "--counts, --samples and --strandedness must have the same number of items"
+        )
         raise RuntimeError(msg)
 
-    main(args.samples, args.counts, args.strand)
+    main(args.samples, args.counts, args.strandedness)
diff --git a/test/test_expression.yml b/test/test_expression.yml
@@ -88,12 +88,10 @@
     - path: merged_expression_unstranded_mqc.tsv
       contains:
         - "id: mqc_expression_unstranded"
+    # Empty file since this sample is unstranded
     - path: merged_expression_forward_mqc.tsv
-      contains:
-        - "id: mqc_expression_forward"
+    # Empty file since this sample is unstranded
     - path: merged_expression_reverse_mqc.tsv
-      contains:
-        - "id: mqc_expression_reverse"
     # Test that we create the multiqc report
     - path: multiqc_expression.html
       contains:
@@ -160,6 +158,9 @@
         - "X\tSRR8615409\tSRR8615409"
         # Results for the forward strand
         - "ENSG00000211459\t18\t18"
+    - path: merged_expression_forward_mqc.tsv
+      contains:
+        - "id: mqc_expression_forward"
 
 - name: Test running the expression module with a reverse stranded sample
   tags:
@@ -179,6 +180,9 @@
         - "X\tSRR8615409\tSRR8615409"
         # Results for the reverse strand
         - "ENSG00000211459\t12\t12"
+    - path: merged_expression_reverse_mqc.tsv
+      contains:
+        - "id: mqc_expression_reverse"
 
 
 - name: Check formatting of Snakemake files