From 4636c88eb4c5829dc0c89d104b3d9d613c335bf1 Mon Sep 17 00:00:00 2001 From: Redmar van den Berg Date: Tue, 5 Nov 2024 14:19:17 +0100 Subject: [PATCH] Transform the coverage counts for each strand for seAMLess --- includes/expression/Snakefile | 39 +++++++++++++++++++ .../expression/scripts/transform_counts.py | 4 +- test/test_expression.yml | 10 ++++- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/includes/expression/Snakefile b/includes/expression/Snakefile index e93a13b..6da4359 100644 --- a/includes/expression/Snakefile +++ b/includes/expression/Snakefile @@ -4,6 +4,9 @@ include: "common.smk" rule all: input: multiqc="multiqc_expression.html", + unstranded=[ + f"{sample.sample}/expression/seAMLess.unstranded.tsv" for sample in samples + ], rule bed_coverage: @@ -53,6 +56,42 @@ rule normalized_coverage: """ +rule transform_counts: + """Transform the counts table to use with seAMLess""" + input: + counts=get_counts, + src=srcdir("scripts/transform_counts.py"), + output: + unstranded="{sample}/expression/seAMLess.unstranded.tsv", + forward="{sample}/expression/seAMLess.forward.tsv", + reverse_="{sample}/expression/seAMLess.reverse.tsv", + log: + "log/transform_counts.{sample}.txt", + threads: 1 + container: + containers["pysam"] + shell: + """ + python3 {input.src} \ + --counts {input.counts} \ + --strand unstranded \ + --sample {wildcards.sample} \ + > {output.unstranded} 2> {log} + + python3 {input.src} \ + --counts {input.counts} \ + --strand forward \ + --sample {wildcards.sample} \ + > {output.forward} 2>> {log} + + python3 {input.src} \ + --counts {input.counts} \ + --strand reverse\ + --sample {wildcards.sample} \ + > {output.reverse_} 2>> {log} + """ + + rule merge_samples: input: counts=[module_output.normalized_expression(sample) for sample in samples], diff --git a/includes/expression/scripts/transform_counts.py b/includes/expression/scripts/transform_counts.py index ae1948c..29c7587 100644 --- a/includes/expression/scripts/transform_counts.py +++ b/includes/expression/scripts/transform_counts.py @@ -22,7 +22,7 @@ def main(countfile, strand, sample): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--count", required=True, help="STAR counts file") + parser.add_argument("--counts", required=True, help="STAR counts file") parser.add_argument( "--strand", choices=["unstranded", "forward", "reverse"], @@ -33,4 +33,4 @@ def main(countfile, strand, sample): args = parser.parse_args() - main(args.count, args.strand, args.sample) + main(args.counts, args.strand, args.sample) diff --git a/test/test_expression.yml b/test/test_expression.yml index f6492c4..afb542a 100644 --- a/test/test_expression.yml +++ b/test/test_expression.yml @@ -12,7 +12,7 @@ stdout: contains: - "rule merge_samples" - + - "rule transform_counts" - name: Test error on unknown housekeeping gene tags: @@ -46,6 +46,14 @@ - path: SRR8615409/expression/coverage.csv contains: - "MT-CO2\t282\t158\t124" + # Test the transformed coverage counts to be used with seAMLess. This will + # only work if there are at least two samples, so we put the sample in + # there twice + - path: SRR8615409/expression/seAMLess.forward.tsv + - path: SRR8615409/expression/seAMLess.reverse.tsv + - path: SRR8615409/expression/seAMLess.unstranded.tsv + contains: + - "X\tSRR8615409\tSRR8615409" # Since MT-CO2 was used both as gene of interest and only housekeeping # gene, the normalized expression for every strand must be 1.0 # MT-CO2 was chosen specifically so it does not overlap any other