From efa03753975c561dfd0e52fa747b1eb6cc72e1fc Mon Sep 17 00:00:00 2001 From: Maria Katsantoni <31883096+mkatsanto@users.noreply.github.com> Date: Sun, 4 Dec 2022 22:01:27 +0100 Subject: [PATCH] feat: addition of snakemake compatible format and linting (#104) --- .github/workflows/ci.yml | 20 +- .gitignore | 1 + config/config.yaml | 18 + install/environment.dev.yml | 2 + tests/test_format/test_snakefmt.sh | 23 + tests/test_format/test_snakemake_lint.sh | 27 + tests/test_htsinfer_workflow/test.local.sh | 2 +- workflow/Snakefile | 2312 +++++++++----------- workflow/rules/common.smk | 32 + workflow/rules/htsinfer.smk | 83 +- workflow/rules/paired_end.snakefile.smk | 602 +++-- workflow/rules/single_end.snakefile.smk | 551 +++-- workflow/rules/sra_download.smk | 98 +- 13 files changed, 1791 insertions(+), 1980 deletions(-) create mode 100644 config/config.yaml create mode 100644 tests/test_format/test_snakefmt.sh create mode 100644 tests/test_format/test_snakemake_lint.sh create mode 100644 workflow/rules/common.smk diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b59cfc4..f968fe3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ on: jobs: - snakemake-graphs: + snakemake-graphs-format: runs-on: ubuntu-20.04 defaults: run: @@ -36,20 +36,30 @@ jobs: - name: Update zarp env with root. packages run: mamba env update -n zarp -f install/environment.root.yml + - name: Update zarp env with dev. packages + run: mamba env update -n zarp -f install/environment.dev.yml + - name: Display all miniconda & env info run: | conda info -a conda list + - name: Run test script for snakemake format + run: bash tests/test_format/test_snakefmt.sh + + - name: Run test script for snakemake lint + run: bash tests/test_format/test_snakemake_lint.sh + - name: Run test script for snakemake rulegraph run: bash tests/test_create_rule_graph/test.sh - name: Run test script for snakemake DAG run: bash tests/test_create_dag_image/test.sh + integration-singularity: needs: - - snakemake-graphs + - snakemake-graphs-format runs-on: ubuntu-20.04 defaults: run: @@ -86,7 +96,7 @@ jobs: integration-singularity-tempflag: needs: - - snakemake-graphs + - snakemake-graphs-format runs-on: ubuntu-20.04 defaults: run: @@ -123,7 +133,7 @@ jobs: integration-singularity-MultipleLanes: needs: - - snakemake-graphs + - snakemake-graphs-format runs-on: ubuntu-20.04 defaults: run: @@ -160,7 +170,7 @@ jobs: integration-conda: needs: - - snakemake-graphs + - snakemake-graphs-format runs-on: ubuntu-20.04 defaults: run: diff --git a/.gitignore b/.gitignore index babce25..d6e7e83 100644 --- a/.gitignore +++ b/.gitignore @@ -329,6 +329,7 @@ pip-selfcheck.json config/* !config/PUT_YOUR_WORKFLOW_RUN_CONFIGS_HERE !config/README.md +!config/config.yaml ._* ._.DS_Store .snakemake/ diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000..76d5400 --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,18 @@ +--- + # Required fields + samples: "tests/input_files/samples.tsv" + output_dir: "results" + log_dir: "logs" + cluster_log_dir: "logs/cluster" + kallisto_indexes: "results/kallisto_indexes" + salmon_indexes: "results/salmon_indexes" + star_indexes: "results/star_indexes" + alfa_indexes: "results/alfa_indexes" + # Optional fields + rule_config: "tests/input_files/rule_config.yaml" + report_description: "No description provided by user" + report_logo: "images/logo.128px.png" + report_url: "https://zavolan.biozentrum.unibas.ch/" + author_name: "NA" + author_email: "NA" +... diff --git a/install/environment.dev.yml b/install/environment.dev.yml index 7f5e9c5..2dbcc5f 100644 --- a/install/environment.dev.yml +++ b/install/environment.dev.yml @@ -5,6 +5,8 @@ channels: dependencies: - bedtools=2.29.2 - biopython=1.76 + - snakefmt - unzip=6.0 + diff --git a/tests/test_format/test_snakefmt.sh b/tests/test_format/test_snakefmt.sh new file mode 100644 index 0000000..e966507 --- /dev/null +++ b/tests/test_format/test_snakefmt.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# This script is currently exiting with non-zero status. +# This is expected behaviour though, as several parameters can't be inferred from the test files. + +# Tear down test environment +cleanup () { + rc=$? + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run tests +snakefmt --check ../../workflow diff --git a/tests/test_format/test_snakemake_lint.sh b/tests/test_format/test_snakemake_lint.sh new file mode 100644 index 0000000..65e89a3 --- /dev/null +++ b/tests/test_format/test_snakemake_lint.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# This script is currently exiting with non-zero status. +# This is expected behaviour though, as several parameters can't be inferred from the test files. + +# Tear down test environment +cleanup () { + rc=$? + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run tests +snakemake \ + --snakefile="../../workflow/Snakefile" \ + --profile="../../profiles/local-singularity" \ + --configfile="../input_files/config.yaml" \ + --lint \ No newline at end of file diff --git a/tests/test_htsinfer_workflow/test.local.sh b/tests/test_htsinfer_workflow/test.local.sh index 5798f9f..ecca09e 100755 --- a/tests/test_htsinfer_workflow/test.local.sh +++ b/tests/test_htsinfer_workflow/test.local.sh @@ -13,7 +13,7 @@ cleanup () { rm -rf .java/ rm -rf .snakemake/ rm -rf logs/ - #rm -rf results/ + rm -rf results/ rm -rf Log.out cd $user_dir echo "Exit status: $rc" diff --git a/workflow/Snakefile b/workflow/Snakefile index 53ad0a7..2a37bcf 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -7,102 +7,82 @@ from shlex import quote from typing import Tuple from snakemake.utils import validate + +configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml") + + ## Preparations # Get sample table samples_table = pd.read_csv( - config['samples'], + config["samples"], header=0, index_col=0, - comment='#', - engine='python', + comment="#", + engine="python", sep="\t", ) # dict for translation of "directionality parameters" directionality_dict = { - "SF": - {"kallisto":"--fr-stranded", - "alfa": "fr-secondstrand", - "alfa_plus": "str1", - "alfa_minus": "str2"}, - "SR": - {"kallisto":"--rf-stranded", - "alfa": "fr-firststrand", - "alfa_plus": "str2", - "alfa_minus": "str1"}, + "SF": { + "kallisto": "--fr-stranded", + "alfa": "fr-secondstrand", + "alfa_plus": "str1", + "alfa_minus": "str2", + }, + "SR": { + "kallisto": "--rf-stranded", + "alfa": "fr-firststrand", + "alfa_plus": "str2", + "alfa_minus": "str1", + }, } # dict for alfa output" -alfa_dict = { - "MultimappersIncluded":"UniqueMultiple" , - "UniqueMappers":"Unique" -} +alfa_dict = {"MultimappersIncluded": "UniqueMultiple", "UniqueMappers": "Unique"} # Validate config validate(config, os.path.join("..", "resources", "config_schema.json")) -logger.info(f'Config file after validation: {config}') +logger.info(f"Config file after validation: {config}") # Parse YAML rule config file try: - with open(config['rule_config']) as _file: + with open(config["rule_config"]) as _file: rule_config = yaml.safe_load(_file) logger.info(f"Loaded rule_config from {config['rule_config']}.") except TypeError: - logger.error(f'No string supplied at field "rule_config", but: {type(config["rule_config"])} with content: {config["rule_config"]}') + logger.error( + f'No string supplied at field "rule_config", but: {type(config["rule_config"])} with content: {config["rule_config"]}' + ) raise except FileNotFoundError: - logger.error(f"No rule config file found at {config['rule_config']}. Either provide file or remove rule_config parameter from config.yaml! ") + logger.error( + f"No rule config file found at {config['rule_config']}. Either provide file or remove rule_config parameter from config.yaml! " + ) raise except KeyError: rule_config = {} logger.warning(f"No rule config specified: using default values for all tools.") -## Function definitions - -def get_sample(column_id, search_id=None, search_value=None): - """ Get relevant per sample information from samples table""" - if search_id: - if search_id == 'index': - return str(samples_table[column_id][samples_table.index == search_value][0]) - else: - return str(samples_table[column_id][samples_table[search_id] == search_value][0]) - else: - return str(samples_table[column_id][0]) - -def get_directionality(libtype, tool): - """ Get directionality value for different tools""" - directionality ="" - - for key in directionality_dict.keys(): - # Use the first of 'SF' or 'SR' that is found in libtype to look up directionality params for the current tool - if key in libtype: - directionality = directionality_dict[key][tool] - break - - # If libtype contains neither 'SF', nor 'SR' we don't know what to do - if not directionality: - raise ValueError( - f"Unknown libtype {libtype}.\n" - "Provide one of 'SF', 'SR', 'ISF', 'ISR', 'OSF', 'OSR', 'MSF', 'MSR' in samples.tsv!" - ) - - return directionality - -def parse_rule_config(rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = ()): +def parse_rule_config( + rule_config: dict, current_rule: str, immutable: Tuple[str, ...] = () +): """Get rule specific parameters from rule_config file""" - + # If rule config file not present, emtpy string will be returned if not rule_config: logger.info(f"No rule config specified: using default values for all tools.") - return '' + return "" # Same if current rule not specified in rule config if current_rule not in rule_config or not rule_config[current_rule]: - logger.info(f"No additional parameters for rule {current_rule} specified: using default settings.") - return '' + logger.info( + f"No additional parameters for rule {current_rule} specified: using default settings." + ) + return "" # Subset only section for current rule rule_config = rule_config[current_rule] - + # Build list of parameters and values params_vals = [] for param, val in rule_config.items(): @@ -126,15 +106,23 @@ def parse_rule_config(rule_config: dict, current_rule: str, immutable: Tuple[str f"'{type(val).__name__}' for value of parameter '{param}'" ) # Return quoted string - add_params = ' '.join(quote(item) for item in params_vals) - logger.info(f"User specified additional parameters for rule {current_rule}:\n {add_params}") + add_params = " ".join(quote(item) for item in params_vals) + logger.info( + f"User specified additional parameters for rule {current_rule}:\n {add_params}" + ) return add_params # Global config -localrules: start, finish, rename_star_rpm_for_alfa, prepare_multiqc_config +localrules: + start, + finish, + rename_star_rpm_for_alfa, + prepare_multiqc_config, + # Include subworkflows +include: os.path.join("rules", "common.smk") include: os.path.join("rules", "paired_end.snakefile.smk") include: os.path.join("rules", "single_end.snakefile.smk") @@ -144,142 +132,133 @@ rule finish: Rule for collecting outputs """ input: - multiqc_report = os.path.join( - config['output_dir'], - "multiqc_summary"), - bigWig = expand( + multiqc_report=os.path.join(config["output_dir"], "multiqc_summary"), + bigWig=expand( os.path.join( config["output_dir"], "samples", "{sample}", "bigWig", "{renamed_unique}", - "{sample}_{renamed_unique}_{strand}.bw"), + "{sample}_{renamed_unique}_{strand}.bw", + ), sample=pd.unique(samples_table.index.values), strand=["plus", "minus"], - renamed_unique=alfa_dict.keys()), - salmon_merge_genes = expand( + renamed_unique=alfa_dict.keys(), + ), + salmon_merge_genes=expand( os.path.join( config["output_dir"], "summary_salmon", "quantmerge", - "genes_{salmon_merge_on}.tsv"), - salmon_merge_on=["tpm", "numreads"]), - salmon_merge_transcripts = expand( + "genes_{salmon_merge_on}.tsv", + ), + salmon_merge_on=["tpm", "numreads"], + ), + salmon_merge_transcripts=expand( os.path.join( config["output_dir"], "summary_salmon", "quantmerge", - "transcripts_{salmon_merge_on}.tsv"), - salmon_merge_on=["tpm", "numreads"]), - kallisto_merge_transcripts = os.path.join( - config["output_dir"], - "summary_kallisto", - "transcripts_tpm.tsv"), - kallisto_merge_genes = os.path.join( - config["output_dir"], - "summary_kallisto", - "genes_tpm.tsv") + "transcripts_{salmon_merge_on}.tsv", + ), + salmon_merge_on=["tpm", "numreads"], + ), + kallisto_merge_transcripts=os.path.join( + config["output_dir"], "summary_kallisto", "transcripts_tpm.tsv" + ), + kallisto_merge_genes=os.path.join( + config["output_dir"], "summary_kallisto", "genes_tpm.tsv" + ), + + +current_rule = "start" -current_rule = 'start' rule start: - ''' + """ Get samples - ''' + """ input: - reads = lambda wildcards: - expand( - pd.Series( - samples_table.loc[wildcards.sample, wildcards.mate] - ).values) - + reads=lambda wildcards: expand( + pd.Series(samples_table.loc[wildcards.sample, wildcards.mate]).values + ), output: - reads = os.path.join( + reads=os.path.join( config["output_dir"], "samples", "{sample}", "start", - "{sample}.{mate}.fastq.gz") - + "{sample}.{mate}.fastq.gz", + ), params: - cluster_log_path = config["cluster_log_dir"] - + cluster_log_path=config["cluster_log_dir"], log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{sample}.{mate}.stderr.log"), - stdout = os.path.join( + f"{current_rule}_{{sample}}.{{mate}}.stderr.log", + ), + stdout=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{sample}.{mate}.stdout.log") - - singularity: + f"{current_rule}_{{sample}}.{{mate}}.stdout.log", + ), + container: "docker://ubuntu:focal-20210416" - shell: "(cat {input.reads} > {output.reads}) \ 1> {log.stdout} 2> {log.stderr} " -current_rule = 'fastqc' +current_rule = "fastqc" + + rule fastqc: - ''' + """ A quality control tool for high throughput sequence data - ''' + """ input: - reads = os.path.join( + reads=os.path.join( config["output_dir"], "samples", "{sample}", "start", - "{sample}.{mate}.fastq.gz") - + "{sample}.{mate}.fastq.gz", + ), output: - outdir = directory( + outdir=directory( os.path.join( - config["output_dir"], - "samples", - "{sample}", - "fastqc", - "{mate}")) - - params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - '--outdir', - ) + config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" ) - + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("--outdir",) + ), threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - - singularity: + mem_mb=lambda wildcards, attempt: 4096 * attempt, + container: "docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1" - conda: os.path.join(workflow.basedir, "envs", "fastqc.yaml") - log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{mate}.stderr.log"), - stdout = os.path.join( + f"{current_rule}_{{mate}}.stderr.log", + ), + stdout=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{mate}.stdout.log") - + f"{current_rule}_{{mate}}.stdout.log", + ), shell: "(mkdir -p {output.outdir}; \ fastqc --outdir {output.outdir} \ @@ -289,57 +268,53 @@ rule fastqc: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'fastqc_trimmed' +current_rule = "fastqc_trimmed" + + rule fastqc_trimmed: - ''' + """ A quality control tool for high throughput sequence data - ''' + """ input: - reads = os.path.join( + reads=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.{mate}.{seqmode}.remove_polya.fastq.gz") - + "{sample}.{mate}.{seqmode}.remove_polya.fastq.gz", + ), output: - outdir = directory( + outdir=directory( os.path.join( config["output_dir"], "samples", "{sample}", "fastqc_trimmed", - "{mate}_{seqmode}")) - - params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - '--outdir', - ) + "{mate}_{seqmode}", ) - + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("--outdir",) + ), threads: 1 - - singularity: + container: "docker://quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1" - conda: os.path.join(workflow.basedir, "envs", "fastqc.yaml") - log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{seqmode}_{mate}.stderr.log"), - stdout = os.path.join( + f"{current_rule}_{{seqmode}}_{{mate}}.stderr.log", + ), + stdout=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{seqmode}_{mate}.stdout.log") - + f"{current_rule}_{{seqmode}}_{{mate}}.stdout.log", + ), shell: "(mkdir -p {output.outdir}; \ fastqc --outdir {output.outdir} \ @@ -349,83 +324,68 @@ rule fastqc_trimmed: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'create_index_star' +current_rule = "create_index_star" + + rule create_index_star: """ Create index for STAR alignments """ input: - genome = lambda wildcards: - os.path.abspath(get_sample( - 'genome', - search_id='organism', - search_value=wildcards.organism)), - - gtf = lambda wildcards: - os.path.abspath(get_sample( - 'gtf', - search_id='organism', - search_value=wildcards.organism)) - + genome=lambda wildcards: os.path.abspath( + get_sample("genome", search_id="organism", search_value=wildcards.organism) + ), + gtf=lambda wildcards: os.path.abspath( + get_sample("gtf", search_id="organism", search_value=wildcards.organism) + ), output: - chromosome_info = os.path.join( - config['star_indexes'], + chromosome_info=os.path.join( + config["star_indexes"], "{organism}", "{index_size}", "STAR_index", - "chrNameLength.txt"), - chromosomes_names = os.path.join( - config['star_indexes'], + "chrNameLength.txt", + ), + chromosomes_names=os.path.join( + config["star_indexes"], "{organism}", "{index_size}", "STAR_index", - "chrName.txt") - + "chrName.txt", + ), params: - cluster_log_path = config["cluster_log_dir"], - output_dir = os.path.join( - config['star_indexes'], - "{organism}", - "{index_size}", - "STAR_index"), - outFileNamePrefix = os.path.join( - config['star_indexes'], - "{organism}", - "{index_size}", - "STAR_index/STAR_"), - sjdbOverhang = "{index_size}", - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.chromosome_info), + outFileNamePrefix=os.path.join( + config["star_indexes"], "{organism}", "{index_size}", "STAR_index/STAR_" + ), + sjdbOverhang="{index_size}", + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--runMode', - '--sjdbOverhang', - '--genomeDir', - '--genomeFastaFiles', - '--outFileNamePrefix', - '--sjdbGTFfile', - ) - ) - - singularity: + "--runMode", + "--sjdbOverhang", + "--genomeDir", + "--genomeFastaFiles", + "--outFileNamePrefix", + "--sjdbGTFfile", + ), + ), + container: "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1" - conda: os.path.join(workflow.basedir, "envs", "STAR.yaml") - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 32000 * attempt - + mem_mb=lambda wildcards, attempt: 32000 * attempt, log: - stderr = os.path.join( - config['log_dir'], - current_rule + "_{organism}_{index_size}.stderr.log"), - stdout = os.path.join( - config['log_dir'], - current_rule + "_{organism}_{index_size}.stdout.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.stdout.log" + ), shell: "(mkdir -p {params.output_dir}; \ chmod -R 777 {params.output_dir}; \ @@ -441,57 +401,45 @@ rule create_index_star: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'extract_transcriptome' +current_rule = "extract_transcriptome" + + rule extract_transcriptome: """ Create transcriptome from genome and gene annotations """ input: - genome = lambda wildcards: - get_sample( - 'genome', - search_id='organism', - search_value=wildcards.organism), - gtf = lambda wildcards: - get_sample( - 'gtf', - search_id='organism', - search_value=wildcards.organism) + genome=lambda wildcards: get_sample( + "genome", search_id="organism", search_value=wildcards.organism + ), + gtf=lambda wildcards: get_sample( + "gtf", search_id="organism", search_value=wildcards.organism + ), output: - transcriptome = temp(os.path.join( - config['output_dir'], - "transcriptome", - "{organism}", - "transcriptome.fa")) - + transcriptome=temp( + os.path.join( + config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa" + ) + ), params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-w', - '-g', - ) - ) - - singularity: + "-w", + "-g", + ), + ), + container: "docker://quay.io/biocontainers/gffread:0.12.1--h2e03b76_1" - conda: os.path.join(workflow.basedir, "envs", "gffread.yaml") - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: - stderr = os.path.join( - config['log_dir'], - current_rule + "_{organism}.log"), - stdout = os.path.join( - config['log_dir'], - current_rule + "_{organism}.log") - + stderr=os.path.join(config["log_dir"], f"{current_rule}_{{organism}}.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}_{{organism}}.log"), shell: "(gffread \ -w {output.transcriptome} \ @@ -501,111 +449,99 @@ rule extract_transcriptome: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'concatenate_transcriptome_and_genome' +current_rule = "concatenate_transcriptome_and_genome" + + rule concatenate_transcriptome_and_genome: """ Concatenate genome and transcriptome """ input: - transcriptome = os.path.join( - config['output_dir'], - "transcriptome", - "{organism}", - "transcriptome.fa"), - - genome = lambda wildcards: - get_sample( - 'genome', - search_id='organism', - search_value=wildcards.organism) - + transcriptome=os.path.join( + config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa" + ), + genome=lambda wildcards: get_sample( + "genome", search_id="organism", search_value=wildcards.organism + ), output: - genome_transcriptome = temp(os.path.join( - config['output_dir'], - "transcriptome", - "{organism}", - "genome_transcriptome.fa")) - + genome_transcriptome=temp( + os.path.join( + config["output_dir"], + "transcriptome", + "{organism}", + "genome_transcriptome.fa", + ) + ), params: - cluster_log_path = config["cluster_log_dir"] - - singularity: + cluster_log_path=config["cluster_log_dir"], + container: "docker://ubuntu:focal-20210416" - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: - stderr = os.path.join( - config['log_dir'], - current_rule + "_{organism}.stderr.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}.stderr.log" + ), shell: "(cat {input.transcriptome} {input.genome} \ 1> {output.genome_transcriptome}) \ 2> {log.stderr}" -current_rule = 'create_index_salmon' +current_rule = "create_index_salmon" + + rule create_index_salmon: """ Create index for Salmon quantification """ input: - genome_transcriptome = os.path.join( - config['output_dir'], + genome_transcriptome=os.path.join( + config["output_dir"], "transcriptome", "{organism}", - "genome_transcriptome.fa"), - chr_names = lambda wildcards: - os.path.join( - config['star_indexes'], - get_sample('organism'), - get_sample("index_size"), - "STAR_index", - "chrName.txt") - + "genome_transcriptome.fa", + ), + chr_names=lambda wildcards: os.path.join( + config["star_indexes"], + get_sample("organism"), + get_sample("index_size"), + "STAR_index", + "chrName.txt", + ), output: - index = directory( + index=directory( os.path.join( - config['salmon_indexes'], - "{organism}", - "{kmer}", - "salmon.idx")) - + config["salmon_indexes"], "{organism}", "{kmer}", "salmon.idx" + ) + ), params: - cluster_log_path = config["cluster_log_dir"], - kmerLen = "{kmer}", - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + kmerLen="{kmer}", + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--transcripts', - '--decoys', - '--index', - '--kmerLen', - ) - ) - - singularity: + "--transcripts", + "--decoys", + "--index", + "--kmerLen", + ), + ), + container: "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 8 - resources: - mem_mb=lambda wildcards, attempt: 32000 * attempt - + mem_mb=lambda wildcards, attempt: 32000 * attempt, log: - stderr = os.path.join( - config['log_dir'], - current_rule + "_{organism}_{kmer}.stderr.log"), - stdout = os.path.join( - config['log_dir'], - current_rule + "_{organism}_{kmer}.stdout.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{kmer}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}_{{kmer}}.stdout.log" + ), shell: "(salmon index \ --transcripts {input.genome_transcriptome} \ @@ -617,54 +553,38 @@ rule create_index_salmon: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'create_index_kallisto' +current_rule = "create_index_kallisto" + + rule create_index_kallisto: """ Create index for Kallisto quantification """ input: - transcriptome = os.path.join( - config['output_dir'], - "transcriptome", - "{organism}", - "transcriptome.fa") - + transcriptome=os.path.join( + config["output_dir"], "transcriptome", "{organism}", "transcriptome.fa" + ), output: - index = os.path.join( - config['kallisto_indexes'], - "{organism}", - "kallisto.idx") - + index=os.path.join(config["kallisto_indexes"], "{organism}", "kallisto.idx"), params: - cluster_log_path = config["cluster_log_dir"], - output_dir = os.path.join( - config['kallisto_indexes'], - "{organism}"), - additional_params = parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - '-i', - ) - ) - - singularity: + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.index), + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("-i",) + ), + container: "docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2" - conda: os.path.join(workflow.basedir, "envs", "kallisto.yaml") - resources: - mem_mb=lambda wildcards, attempt: 8192 * attempt - + mem_mb=lambda wildcards, attempt: 8192 * attempt, log: - stderr = os.path.join( - config['log_dir'], - current_rule + "_{organism}.stderr.log"), - stdout = os.path.join( - config['log_dir'], - current_rule + "_{organism}.stdout.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{organism}}.stdout.log" + ), shell: "(mkdir -p {params.output_dir}; \ chmod -R 777 {params.output_dir}; \ @@ -675,50 +595,39 @@ rule create_index_kallisto: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'extract_transcripts_as_bed12' +current_rule = "extract_transcripts_as_bed12" + + rule extract_transcripts_as_bed12: """ Convert transcripts to BED12 format """ input: - gtf = lambda wildcards: - get_sample('gtf') - + gtf=lambda wildcards: get_sample("gtf"), output: - bed12 = temp(os.path.join( - config['output_dir'], - "full_transcripts_protein_coding.bed")) - + bed12=temp( + os.path.join(config["output_dir"], "full_transcripts_protein_coding.bed") + ), params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--gtf', - '--bed12', - ) - ) - - singularity: + "--gtf", + "--bed12", + ), + ), + container: "docker://quay.io/biocontainers/zgtf:0.1.1--pyh5e36f6f_0" - conda: os.path.join(workflow.basedir, "envs", "zgtf.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: - stdout = os.path.join( - config['log_dir'], - current_rule + ".stdout.log"), - stderr = os.path.join( - config['log_dir'], - current_rule + ".stderr.log") - + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), shell: "(gtf2bed12 \ --gtf {input.gtf} \ @@ -727,58 +636,54 @@ rule extract_transcripts_as_bed12: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'sort_genomic_alignment_samtools' +current_rule = "sort_genomic_alignment_samtools" + + rule sort_genomic_alignment_samtools: - ''' + """ Sort genome bamfile using samtools - ''' + """ input: - bam = os.path.join( + bam=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.{seqmode}.Aligned.out.bam"), - + "{sample}.{seqmode}.Aligned.out.bam", + ), output: - bam = os.path.join( + bam=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"), - + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", + ), params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=() - ) - - singularity: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=() + ), + container: "docker://quay.io/biocontainers/samtools:1.9--h10a08f8_12" - conda: os.path.join(workflow.basedir, "envs", "samtools.yaml") - threads: 8 - resources: - mem_mb=lambda wildcards, attempt: 8000 * attempt - + mem_mb=lambda wildcards, attempt: 8000 * attempt, log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + ".{seqmode}.stderr.log"), - stdout = os.path.join( + f"{current_rule}.{{seqmode}}.stderr.log", + ), + stdout=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + ".{seqmode}.stdout.log") - + f"{current_rule}.{{seqmode}}.stdout.log", + ), shell: "(samtools sort \ -o {output.bam} \ @@ -788,58 +693,54 @@ rule sort_genomic_alignment_samtools: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'index_genomic_alignment_samtools' +current_rule = "index_genomic_alignment_samtools" + + rule index_genomic_alignment_samtools: - ''' + """ Index genome bamfile using samtools - ''' + """ input: - bam = os.path.join( + bam=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam") - + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", + ), output: - bai = os.path.join( + bai=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai") - + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", + ), params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=() - ) - - singularity: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=() + ), + container: "docker://quay.io/biocontainers/samtools:1.3.1--h1b8c3c0_8" - conda: os.path.join(workflow.basedir, "envs", "samtools.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + ".{seqmode}.stderr.log"), - stdout = os.path.join( + f"{current_rule}.{{seqmode}}.stderr.log", + ), + stdout=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + ".{seqmode}.stdout.log") - + f"{current_rule}.{{seqmode}}.stdout.log", + ), shell: "(samtools index \ {params.additional_params} \ @@ -847,81 +748,72 @@ rule index_genomic_alignment_samtools: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'calculate_TIN_scores' +current_rule = "calculate_TIN_scores" + + rule calculate_TIN_scores: """ Calculate transcript integrity (TIN) score """ input: - bam = lambda wildcards: - expand( - os.path.join( - config['output_dir'], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"), - sample=wildcards.sample, - seqmode=get_sample( - 'seqmode', - search_id='index', - search_value=wildcards.sample)), - bai = lambda wildcards: - expand( - os.path.join( - config['output_dir'], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"), - sample=wildcards.sample, - seqmode=get_sample( - 'seqmode', - search_id='index', - search_value=wildcards.sample)), - transcripts_bed12 = os.path.join( - config['output_dir'], - "full_transcripts_protein_coding.bed") - - output: - TIN_score = temp(os.path.join( - config['output_dir'], - "samples", - "{sample}", - "TIN", - "TIN_score.tsv")) - + bam=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", + ), + sample=wildcards.sample, + seqmode=get_sample( + "seqmode", search_id="index", search_value=wildcards.sample + ), + ), + bai=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", + ), + sample=wildcards.sample, + seqmode=get_sample( + "seqmode", search_id="index", search_value=wildcards.sample + ), + ), + transcripts_bed12=os.path.join( + config["output_dir"], "full_transcripts_protein_coding.bed" + ), + output: + TIN_score=temp( + os.path.join( + config["output_dir"], "samples", "{sample}", "TIN", "TIN_score.tsv" + ) + ), params: - cluster_log_path = config["cluster_log_dir"], - sample = "{sample}", - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + sample="{sample}", + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-i', - '-r', - '--names', - ) - ) - - singularity: + "-i", + "-r", + "--names", + ), + ), + container: "docker://quay.io/biocontainers/tin-score-calculation:0.6.3--pyh5e36f6f_0" - conda: os.path.join(workflow.basedir, "envs", "tin-score-calculation.yaml") - threads: 8 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.bam) * 1024 * attempt - + mem_mb=lambda wildcards, attempt, input: len(input.bam) * 1024 * attempt, log: - stderr = os.path.join( - config['log_dir'], - "samples", - "{sample}", - current_rule + ".log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", f"{current_rule}.log" + ), shell: "(calculate-tin.py \ -i {input.bam} \ @@ -932,85 +824,82 @@ rule calculate_TIN_scores: > {output.TIN_score};) 2> {log.stderr}" -current_rule = 'salmon_quantmerge_genes' +current_rule = "salmon_quantmerge_genes" + + rule salmon_quantmerge_genes: - ''' + """ Merge gene quantifications into a single file - ''' + """ input: - salmon_in = expand( + salmon_in=expand( os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.{seqmode}", - "quant.sf"), + "quant.sf", + ), zip, sample=pd.unique(samples_table.index.values), - seqmode=[get_sample( - 'seqmode', - search_id='index', - search_value=i) - for i in pd.unique(samples_table.index.values)]) - + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), output: - salmon_out = os.path.join( + salmon_out=os.path.join( config["output_dir"], "summary_salmon", "quantmerge", - "genes_{salmon_merge_on}.tsv") - + "genes_{salmon_merge_on}.tsv", + ), params: - cluster_log_path = config["cluster_log_dir"], - salmon_in = expand( + cluster_log_path=config["cluster_log_dir"], + salmon_in=expand( os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.salmon.{seqmode}"), + "{sample}.salmon.{seqmode}", + ), zip, sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[get_sample( - 'seqmode', - search_id='index', - search_value=i) - for i in pd.unique(samples_table.index.values)]), - sample_name_list = expand( - "{sample}", - sample=pd.unique(samples_table.index.values)), - salmon_merge_on = "{salmon_merge_on}", - additional_params = parse_rule_config( + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + sample_name_list=expand( + "{sample}", sample=pd.unique(samples_table.index.values) + ), + salmon_merge_on="{salmon_merge_on}", + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--quants', - '--genes', - '--transcripts', - '--names', - '--column', - '--output', - ) - ) - - singularity: + "--quants", + "--genes", + "--transcripts", + "--names", + "--column", + "--output", + ), + ), + container: "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1024 * attempt - + mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1024 * attempt, log: - stderr = os.path.join( - config["log_dir"], - current_rule + "_{salmon_merge_on}.stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + "_{salmon_merge_on}.stdout.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stdout.log" + ), shell: "(salmon quantmerge \ --quants {params.salmon_in} \ @@ -1022,84 +911,82 @@ rule salmon_quantmerge_genes: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'salmon_quantmerge_transcripts' +current_rule = "salmon_quantmerge_transcripts" + + rule salmon_quantmerge_transcripts: - ''' + """ Merge transcript quantifications into a single file - ''' + """ input: - salmon_in = expand( + salmon_in=expand( os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.{seqmode}", - "quant.sf"), + "quant.sf", + ), zip, sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[get_sample( - 'seqmode', - search_id='index', - search_value=i) - for i in pd.unique(samples_table.index.values)]) - + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), output: - salmon_out = os.path.join( + salmon_out=os.path.join( config["output_dir"], "summary_salmon", "quantmerge", - "transcripts_{salmon_merge_on}.tsv") - + "transcripts_{salmon_merge_on}.tsv", + ), params: - cluster_log_path = config["cluster_log_dir"], - salmon_in = expand( + cluster_log_path=config["cluster_log_dir"], + salmon_in=expand( os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.salmon.{seqmode}"), + "{sample}.salmon.{seqmode}", + ), zip, sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[get_sample( - 'seqmode', - search_id='index', - search_value=i) - for i in pd.unique(samples_table.index.values)]), - sample_name_list = expand( - "{sample}", - sample=pd.unique(samples_table.index.values)), - salmon_merge_on = "{salmon_merge_on}", - additional_params = parse_rule_config( + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + sample_name_list=expand( + "{sample}", sample=pd.unique(samples_table.index.values) + ), + salmon_merge_on="{salmon_merge_on}", + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--quants', - '--genes', - '--transcripts', - '--names', - '--column', - '--output', - ) - ) - singularity: + "--quants", + "--genes", + "--transcripts", + "--names", + "--column", + "--output", + ), + ), + container: "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1000 * attempt - + mem_mb=lambda wildcards, attempt, input: len(input.salmon_in) * 1000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - current_rule + "_{salmon_merge_on}.stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + "_{salmon_merge_on}.stdout.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{salmon_merge_on}}.stdout.log" + ), shell: "(salmon quantmerge \ --quants {params.salmon_in} \ @@ -1110,85 +997,76 @@ rule salmon_quantmerge_transcripts: 1> {log.stdout} 2> {log.stderr}" -current_rule= 'kallisto_merge_genes' +current_rule = "kallisto_merge_genes" + + rule kallisto_merge_genes: - ''' + """ Merge gene quantifications into single file - ''' + """ input: - pseudoalignment = expand( + pseudoalignment=expand( os.path.join( config["output_dir"], "samples", "{sample}", "quant_kallisto", - "{sample}.{seqmode}.kallisto.pseudo.sam"), + "{sample}.{seqmode}.kallisto.pseudo.sam", + ), zip, sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[get_sample( - 'seqmode', - search_id='index', - search_value=i) - for i in pd.unique(samples_table.index.values)]), - gtf = get_sample('gtf') - + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + gtf=get_sample("gtf"), output: - gn_tpm = os.path.join( - config["output_dir"], - "summary_kallisto", - "genes_tpm.tsv"), - gn_counts = os.path.join( - config["output_dir"], - "summary_kallisto", - "genes_counts.tsv") - + gn_tpm=os.path.join(config["output_dir"], "summary_kallisto", "genes_tpm.tsv"), + gn_counts=os.path.join( + config["output_dir"], "summary_kallisto", "genes_counts.tsv" + ), params: - cluster_log_path = config["cluster_log_dir"], - dir_out = os.path.join( - config["output_dir"], - "summary_kallisto"), - tables = ','.join(expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto", - "abundance.h5"), - sample=[i for i in pd.unique(samples_table.index.values)])), - sample_name_list = ','.join(expand( - "{sample}", - sample=pd.unique(samples_table.index.values))), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + dir_out=lambda wildcards, output: os.path.dirname(output.gn_counts), + tables=",".join( + expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "quant_kallisto", + "abundance.h5", + ), + sample=[i for i in pd.unique(samples_table.index.values)], + ) + ), + sample_name_list=",".join( + expand("{sample}", sample=pd.unique(samples_table.index.values)) + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--input', - '--names', - '--txOut', - '--anno', - '--output', - ) - ) - - singularity: + "--input", + "--names", + "--txOut", + "--anno", + "--output", + ), + ), + container: "docker://quay.io/biocontainers/r-merge-kallisto:0.6--hdfd78af_0" - conda: os.path.join(workflow.basedir, "envs", "r-merge-kallisto.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment) * 1000 * attempt - + mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment) + * 1000 + * attempt, log: - stderr = os.path.join( - config["log_dir"], - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + ".stdout.log") - + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), shell: "(merge_kallisto.R \ --input {params.tables} \ @@ -1200,83 +1078,76 @@ rule kallisto_merge_genes: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'kallisto_merge_transcripts' +current_rule = "kallisto_merge_transcripts" + + rule kallisto_merge_transcripts: - ''' + """ Merge transcript quantifications into a single files - ''' + """ input: - pseudoalignment = expand( + pseudoalignment=expand( os.path.join( config["output_dir"], "samples", "{sample}", "quant_kallisto", - "{sample}.{seqmode}.kallisto.pseudo.sam"), + "{sample}.{seqmode}.kallisto.pseudo.sam", + ), zip, sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[get_sample( - 'seqmode', - search_id='index', - search_value=i) - for i in pd.unique(samples_table.index.values)]), - + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), output: - tx_tpm = os.path.join( - config["output_dir"], - "summary_kallisto", - "transcripts_tpm.tsv"), - tx_counts = os.path.join( - config["output_dir"], - "summary_kallisto", - "transcripts_counts.tsv") - + tx_tpm=os.path.join( + config["output_dir"], "summary_kallisto", "transcripts_tpm.tsv" + ), + tx_counts=os.path.join( + config["output_dir"], "summary_kallisto", "transcripts_counts.tsv" + ), params: - cluster_log_path = config["cluster_log_dir"], - dir_out = os.path.join( - config["output_dir"], - "summary_kallisto"), - tables = ','.join(expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto", - "abundance.h5"), - sample=[i for i in pd.unique(samples_table.index.values)])), - sample_name_list = ','.join(expand( - "{sample}", - sample=pd.unique(samples_table.index.values))), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + dir_out=lambda wildcards, output: os.path.dirname(output.tx_counts), + tables=",".join( + expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "quant_kallisto", + "abundance.h5", + ), + sample=[i for i in pd.unique(samples_table.index.values)], + ) + ), + sample_name_list=",".join( + expand("{sample}", sample=pd.unique(samples_table.index.values)) + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--input', - '--names', - '--txOut', - '--output', - ) - ) - - singularity: + "--input", + "--names", + "--txOut", + "--output", + ), + ), + container: "docker://quay.io/biocontainers/r-merge-kallisto:0.6--hdfd78af_0" - conda: os.path.join(workflow.basedir, "envs", "r-merge-kallisto.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment) * 1024 * attempt - + mem_mb=lambda wildcards, attempt, input: len(input.pseudoalignment) + * 1024 + * attempt, log: - stderr = os.path.join( - config["log_dir"], - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + ".stdout.log") - + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), shell: "(merge_kallisto.R \ --input {params.tables} \ @@ -1286,51 +1157,42 @@ rule kallisto_merge_transcripts: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'pca_salmon' +current_rule = "pca_salmon" + + rule pca_salmon: input: - tpm = os.path.join( - config["output_dir"], - "summary_salmon", - "quantmerge", - "{molecule}_tpm.tsv"), - + tpm=os.path.join( + config["output_dir"], "summary_salmon", "quantmerge", "{molecule}_tpm.tsv" + ), output: - out = directory(os.path.join( - config["output_dir"], - "zpca", - "pca_salmon_{molecule}")) - + out=directory( + os.path.join(config["output_dir"], "zpca", "pca_salmon_{molecule}") + ), params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--tpm', - '--out', - ) - ) - - singularity: + "--tpm", + "--out", + ), + ), + container: "docker://quay.io/biocontainers/zpca:0.8.3.post1--pyh5e36f6f_0" - conda: os.path.join(workflow.basedir, "envs", "zpca.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: - stderr = os.path.join( - config["log_dir"], - current_rule + "_{molecule}.stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + "_{molecule}.stdout.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{molecule}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{molecule}}.stdout.log" + ), shell: "(zpca-tpm \ --tpm {input.tpm} \ @@ -1339,50 +1201,40 @@ rule pca_salmon: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'pca_kallisto' +current_rule = "pca_kallisto" + + rule pca_kallisto: input: - tpm = os.path.join( - config["output_dir"], - "summary_kallisto", - "{molecule}_tpm.tsv") - + tpm=os.path.join(config["output_dir"], "summary_kallisto", "{molecule}_tpm.tsv"), output: - out = directory(os.path.join( - config["output_dir"], - "zpca", - "pca_kallisto_{molecule}")) - + out=directory( + os.path.join(config["output_dir"], "zpca", "pca_kallisto_{molecule}") + ), params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--tpm', - '--out', - ) - ) - - singularity: + "--tpm", + "--out", + ), + ), + container: "docker://quay.io/biocontainers/zpca:0.8.3.post1--pyh5e36f6f_0" - conda: os.path.join(workflow.basedir, "envs", "zpca.yaml") - threads: 1 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: - stderr = os.path.join( - config["log_dir"], - current_rule + "_{molecule}.stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + "_{molecule}.stdout.log") - + stderr=os.path.join( + config["log_dir"], f"{current_rule}_{{molecule}}.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], f"{current_rule}_{{molecule}}.stdout.log" + ), shell: "(zpca-tpm \ --tpm {input.tpm} \ @@ -1391,109 +1243,109 @@ rule pca_kallisto: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'star_rpm' +current_rule = "star_rpm" + + rule star_rpm: - ''' + """ Create stranded bedgraph coverage with STARs RPM normalisation - ''' + """ input: - bam = lambda wildcards: - expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam"), - sample=wildcards.sample, - seqmode=get_sample( - 'seqmode', - search_id='index', - search_value=wildcards.sample)), - bai = lambda wildcards: - expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai"), - sample=wildcards.sample, - seqmode=get_sample( - 'seqmode', - search_id='index', - search_value=wildcards.sample)) - + bam=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam", + ), + sample=wildcards.sample, + seqmode=get_sample( + "seqmode", search_id="index", search_value=wildcards.sample + ), + ), + bai=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "map_genome", + "{sample}.{seqmode}.Aligned.sortedByCoord.out.bam.bai", + ), + sample=wildcards.sample, + seqmode=get_sample( + "seqmode", search_id="index", search_value=wildcards.sample + ), + ), output: - str1 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.Unique.str1.out.bg")), - str2 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.UniqueMultiple.str1.out.bg")), - str3 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.Unique.str2.out.bg")), - str4 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.UniqueMultiple.str2.out.bg")) - - shadow: "full" - - params: - cluster_log_path = config["cluster_log_dir"], - out_dir = lambda wildcards, output: - os.path.dirname(output.str1), - prefix = lambda wildcards, output: + str1=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.Unique.str1.out.bg", + ) + ), + str2=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.UniqueMultiple.str1.out.bg", + ) + ), + str3=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.Unique.str2.out.bg", + ) + ), + str4=temp( os.path.join( - os.path.dirname(output.str1), - str(wildcards.sample) + "_"), - additional_params = parse_rule_config( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.UniqueMultiple.str2.out.bg", + ) + ), + shadow: + "full" + params: + cluster_log_path=config["cluster_log_dir"], + out_dir=lambda wildcards, output: os.path.dirname(output.str1), + prefix=lambda wildcards, output: os.path.join( + os.path.dirname(output.str1), f"{str(wildcards.sample)}_" + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--runMode', - '--inputBAMfile', - '--outWigType', - '--outFileNamePrefix', - ) - ) - - singularity: + "--runMode", + "--inputBAMfile", + "--outWigType", + "--outFileNamePrefix", + ), + ), + container: "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1" - conda: os.path.join(workflow.basedir, "envs", "STAR.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 8192 * attempt - + mem_mb=lambda wildcards, attempt: 8192 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stdout.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", f"{current_rule}_.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", f"{current_rule}_.stdout.log" + ), shell: "(mkdir -p {params.out_dir}; \ chmod -R 777 {params.out_dir}; \ @@ -1507,108 +1359,121 @@ rule star_rpm: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'rename_star_rpm_for_alfa' +current_rule = "rename_star_rpm_for_alfa" + + rule rename_star_rpm_for_alfa: input: - plus = lambda wildcards: - expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.{unique}.{plus}.out.bg"), - sample=wildcards.sample, - unique=alfa_dict[wildcards.renamed_unique], - plus=get_directionality(get_sample( - 'libtype', - search_id='index', - search_value=wildcards.sample),"alfa_plus")), - minus = lambda wildcards: - expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "STAR_coverage", - "{sample}_Signal.{unique}.{minus}.out.bg"), - sample=wildcards.sample, - unique=alfa_dict[wildcards.renamed_unique], - minus=get_directionality(get_sample( - 'libtype', - search_id='index', - search_value=wildcards.sample), "alfa_minus")) - - output: - plus = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.{renamed_unique}.plus.bg")), - minus = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.{renamed_unique}.minus.bg")) - + plus=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.{unique}.{plus}.out.bg", + ), + sample=wildcards.sample, + unique=alfa_dict[wildcards.renamed_unique], + plus=get_directionality( + get_sample( + "libtype", search_id="index", search_value=wildcards.sample + ), + "alfa_plus", + ), + ), + minus=lambda wildcards: expand( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "STAR_coverage", + "{sample}_Signal.{unique}.{minus}.out.bg", + ), + sample=wildcards.sample, + unique=alfa_dict[wildcards.renamed_unique], + minus=get_directionality( + get_sample( + "libtype", search_id="index", search_value=wildcards.sample + ), + "alfa_minus", + ), + ), + output: + plus=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "{sample}.{renamed_unique}.plus.bg", + ) + ), + minus=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "ALFA", + "{renamed_unique}", + "{sample}.{renamed_unique}.minus.bg", + ) + ), params: - cluster_log_path = config["cluster_log_dir"] - - singularity: + cluster_log_path=config["cluster_log_dir"], + container: "docker://ubuntu:focal-20210416" - log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{renamed_unique}.stderr.log"), - stdout = os.path.join( + f"{current_rule}_{{renamed_unique}}.stderr.log", + ), + stdout=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{renamed_unique}.stdout.log") - + f"{current_rule}_{{renamed_unique}}.stdout.log", + ), shell: "(cp {input.plus} {output.plus}; \ - cp {input.minus} {output.minus};) \ - 1>{log.stdout} 2>{log.stderr}" + cp {input.minus} {output.minus};) \ + 1>{log.stdout} 2>{log.stderr}" + + +current_rule = "generate_alfa_index" -current_rule = 'generate_alfa_index' rule generate_alfa_index: - ''' Generate ALFA index files from sorted GTF file ''' + """ Generate ALFA index files from sorted GTF file """ input: - gtf = lambda wildcards: - os.path.abspath(get_sample( - 'gtf', - search_id='organism', - search_value=wildcards.organism)), - chr_len = os.path.join( + gtf=lambda wildcards: os.path.abspath( + get_sample("gtf", search_id="organism", search_value=wildcards.organism) + ), + chr_len=os.path.join( config["star_indexes"], "{organism}", "{index_size}", "STAR_index", - "chrNameLength.txt"), - + "chrNameLength.txt", + ), output: - index_stranded = os.path.join( + index_stranded=os.path.join( config["alfa_indexes"], "{organism}", "{index_size}", "ALFA", - "sorted_genes.stranded.ALFA_index"), - index_unstranded = os.path.join( + "sorted_genes.stranded.ALFA_index", + ), + index_unstranded=os.path.join( config["alfa_indexes"], "{organism}", "{index_size}", "ALFA", - "sorted_genes.unstranded.ALFA_index"), - temp_dir = temp( + "sorted_genes.unstranded.ALFA_index", + ), + temp_dir=temp( directory( os.path.join( config["alfa_indexes"], @@ -1617,40 +1482,32 @@ rule generate_alfa_index: "ALFA_temp", ) ) - ) - + ), params: - cluster_log_path = config["cluster_log_dir"], - genome_index = "sorted_genes", - out_dir = lambda wildcards, output: - os.path.dirname(output.index_stranded), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + genome_index="sorted_genes", + out_dir=lambda wildcards, output: os.path.dirname(output.index_stranded), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-a', - '-g', - '--chr_len', - '-o', - ) - ) - - singularity: + "-a", + "-g", + "--chr_len", + "-o", + ), + ), + container: "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0" - conda: os.path.join(workflow.basedir, "envs", "alfa.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: os.path.join( - config["log_dir"], - current_rule + "_{organism}_{index_size}.log") - + config["log_dir"], f"{current_rule}_{{organism}}_{{index_size}}.log" + ), shell: "(mkdir -p {output.temp_dir}; \ alfa -a {input.gtf} \ @@ -1663,69 +1520,67 @@ rule generate_alfa_index: &> {log}" -current_rule = 'alfa_qc' +current_rule = "alfa_qc" + + rule alfa_qc: - ''' + """ Run ALFA from stranded bedgraph files - ''' + """ input: - plus = lambda wildcards: + plus=lambda wildcards: os.path.join( + config["output_dir"], + "samples", + wildcards.sample, + "ALFA", + wildcards.renamed_unique, + f"{wildcards.sample}.{wildcards.renamed_unique}.plus.bg", + ), + minus=lambda wildcards: os.path.join( + config["output_dir"], + "samples", + wildcards.sample, + "ALFA", + wildcards.renamed_unique, + f"{wildcards.sample}.{wildcards.renamed_unique}.minus.bg", + ), + gtf=lambda wildcards: os.path.join( + config["alfa_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("index_size", search_id="index", search_value=wildcards.sample), + "ALFA", + "sorted_genes.stranded.ALFA_index", + ), + output: + biotypes=temp( os.path.join( config["output_dir"], "samples", - wildcards.sample, + "{sample}", "ALFA", - wildcards.renamed_unique, - wildcards.sample + "." + wildcards.renamed_unique + ".plus.bg" - ), - - minus = lambda wildcards: + "{renamed_unique}", + "ALFA_plots.Biotypes.pdf", + ) + ), + categories=temp( os.path.join( config["output_dir"], "samples", - wildcards.sample, - "ALFA", - wildcards.renamed_unique, - wildcards.sample + "." + wildcards.renamed_unique + ".minus.bg" - ), - - gtf = lambda wildcards: - os.path.join( - config["alfa_indexes"], - get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), - get_sample( - 'index_size', - search_id='index', - search_value=wildcards.sample), + "{sample}", "ALFA", - "sorted_genes.stranded.ALFA_index") - - output: - biotypes = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "ALFA_plots.Biotypes.pdf")), - categories = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "ALFA_plots.Categories.pdf")), - table = os.path.join( + "{renamed_unique}", + "ALFA_plots.Categories.pdf", + ) + ), + table=os.path.join( config["output_dir"], "samples", "{sample}", "ALFA", "{renamed_unique}", - "{sample}.ALFA_feature_counts.tsv"), - temp_dir = temp( + "{sample}.ALFA_feature_counts.tsv", + ), + temp_dir=temp( directory( os.path.join( config["output_dir"], @@ -1733,58 +1588,48 @@ rule alfa_qc: "{sample}", "ALFA", "{renamed_unique}", - "ALFA_temp") + "ALFA_temp", + ) ) - ) - + ), params: - cluster_log_path = config["cluster_log_dir"], - out_dir = lambda wildcards, output: - os.path.dirname(output.biotypes), - genome_index = lambda wildcards, input: - os.path.abspath( - os.path.join( - os.path.dirname(input.gtf), - "sorted_genes")), - plus = lambda wildcards, input: - os.path.basename(input.plus), - minus = lambda wildcards, input: - os.path.basename(input.minus), - name = "{sample}", - alfa_orientation = lambda wildcards: - get_directionality(get_sample( - 'libtype', - search_id='index', - search_value=wildcards.sample),"alfa"), - temp_dir = lambda wildcards, output: - os.path.abspath( - os.path.dirname(output.temp_dir)), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + out_dir=lambda wildcards, output: os.path.dirname(output.biotypes), + genome_index=lambda wildcards, input: os.path.abspath( + os.path.join(os.path.dirname(input.gtf), "sorted_genes") + ), + plus=lambda wildcards, input: os.path.basename(input.plus), + minus=lambda wildcards, input: os.path.basename(input.minus), + name="{sample}", + alfa_orientation=lambda wildcards: get_directionality( + get_sample("libtype", search_id="index", search_value=wildcards.sample), + "alfa", + ), + temp_dir=lambda wildcards, output: os.path.abspath( + os.path.dirname(output.temp_dir) + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-g', - '--bedgraph', - '-s', - ) - ) - - singularity: + "-g", + "--bedgraph", + "-s", + ), + ), + container: "docker://quay.io/biocontainers/alfa:1.1.1--pyh5e36f6f_0" - conda: os.path.join(workflow.basedir, "envs", "alfa.yaml") - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + ".{renamed_unique}.log") - + f"{current_rule}.{{renamed_unique}}.log", + ), shell: "(mkdir -p {output.temp_dir};\ cd {params.out_dir}; \ @@ -1797,51 +1642,43 @@ rule alfa_qc: &> {log}" -current_rule = 'prepare_multiqc_config' +current_rule = "prepare_multiqc_config" + + rule prepare_multiqc_config: - ''' + """ Prepare config for the MultiQC - ''' + """ input: - script = os.path.join( - workflow.basedir, - "scripts", - "zarp_multiqc_config.py") - + script=os.path.join(workflow.basedir, "scripts", "zarp_multiqc_config.py"), output: - multiqc_config = os.path.join( - config["output_dir"], - "multiqc_config.yaml") - + multiqc_config=os.path.join(config["output_dir"], "multiqc_config.yaml"), params: - cluster_log_path = config["cluster_log_dir"], - logo_path = config['report_logo'] if 'report_logo' in config else '', - multiqc_intro_text = config['report_description'], - url = config['report_url'] if 'report_url' in config else '', - author_name = config['author_name'], - author_email = config['author_email'], - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + logo_path=config["report_logo"] if "report_logo" in config else "", + multiqc_intro_text=config["report_description"], + url=config["report_url"] if "report_url" in config else "", + author_name=config["author_name"], + author_email=config["author_email"], + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--config', - '--intro-text', - '--custom-logo', - '--url', - ) - ) - + "--config", + "--intro-text", + "--custom-logo", + "--url", + ), + ), log: - stderr = os.path.join( - config["log_dir"], - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + ".stdout.log") - + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), + container: + "docker://quay.io/biocontainers/zavolan-multiqc-plugins:1.3--pyh5e36f6f_0" + conda: + os.path.join(workflow.basedir, "envs", "zavolan-multiqc-plugins.yaml") resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, shell: "(python {input.script} \ --config {output.multiqc_config} \ @@ -1853,144 +1690,130 @@ rule prepare_multiqc_config: {params.additional_params}) \ 1> {log.stdout} 2> {log.stderr}" -current_rule = 'multiqc_report' + +current_rule = "multiqc_report" + + rule multiqc_report: - ''' + """ Create report with MultiQC - ''' + """ input: - fastqc_se = expand( + fastqc_se=expand( os.path.join( - config['output_dir'], - "samples", - "{sample}", - "fastqc", - "{mate}"), + config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" + ), sample=pd.unique(samples_table.index.values), - mate="fq1"), - - fastqc_pe = expand( + mate="fq1", + ), + fastqc_pe=expand( os.path.join( - config['output_dir'], - "samples", - "{sample}", - "fastqc", - "{mate}"), - sample=[i for i in pd.unique( - samples_table[samples_table['seqmode'] == 'pe'].index.values)], - mate="fq2"), - - fastqc_trimmed_se = expand( + config["output_dir"], "samples", "{sample}", "fastqc", "{mate}" + ), + sample=[ + i + for i in pd.unique( + samples_table[samples_table["seqmode"] == "pe"].index.values + ) + ], + mate="fq2", + ), + fastqc_trimmed_se=expand( os.path.join( - config['output_dir'], + config["output_dir"], "samples", "{sample}", "fastqc_trimmed", - "fq1_{seqmode}"), + "fq1_{seqmode}", + ), zip, sample=pd.unique(samples_table.index.values), - seqmode=[get_sample('seqmode', search_id='index', search_value=i) - for i in pd.unique(samples_table.index.values)]), - - fastqc_trimmed_pe = expand( + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + fastqc_trimmed_pe=expand( os.path.join( - config['output_dir'], + config["output_dir"], "samples", "{sample}", "fastqc_trimmed", - "{mate}_pe"), - sample=[i for i in pd.unique( - samples_table[samples_table['seqmode'] == 'pe'].index.values)], - mate='fq2' + "{mate}_pe", ), - - pseudoalignment = expand( + sample=[ + i + for i in pd.unique( + samples_table[samples_table["seqmode"] == "pe"].index.values + ) + ], + mate="fq2", + ), + pseudoalignment=expand( os.path.join( - config['output_dir'], + config["output_dir"], "samples", "{sample}", "quant_kallisto", - "{sample}.{seqmode}.kallisto.pseudo.sam"), + "{sample}.{seqmode}.kallisto.pseudo.sam", + ), zip, sample=[i for i in pd.unique(samples_table.index.values)], - seqmode=[get_sample('seqmode', search_id='index', search_value=i) - for i in pd.unique(samples_table.index.values)]), - - TIN_score = expand( + seqmode=[ + get_sample("seqmode", search_id="index", search_value=i) + for i in pd.unique(samples_table.index.values) + ], + ), + TIN_score=expand( + os.path.join( + config["output_dir"], "samples", "{sample}", "TIN", "TIN_score.tsv" + ), + sample=pd.unique(samples_table.index.values), + ), + tables=lambda wildcards: expand( os.path.join( - config['output_dir'], + config["output_dir"], "samples", "{sample}", - "TIN", - "TIN_score.tsv"), - sample=pd.unique(samples_table.index.values)), - - tables = lambda wildcards: - expand( - os.path.join( - config["output_dir"], - "samples", - "{sample}", - "ALFA", - "{renamed_unique}", - "{sample}.ALFA_feature_counts.tsv"), - sample=pd.unique(samples_table.index.values), - renamed_unique=alfa_dict.keys()), - - zpca_salmon = expand(os.path.join( - config["output_dir"], - "zpca", - "pca_salmon_{molecule}"), - molecule=["genes", "transcripts"]), - - zpca_kallisto = expand(os.path.join( - config["output_dir"], - "zpca", - "pca_kallisto_{molecule}"), - molecule=["genes", "transcripts"] + "ALFA", + "{renamed_unique}", + "{sample}.ALFA_feature_counts.tsv", + ), + sample=pd.unique(samples_table.index.values), + renamed_unique=alfa_dict.keys(), ), - - multiqc_config = os.path.join( - config["output_dir"], - "multiqc_config.yaml") - + zpca_salmon=expand( + os.path.join(config["output_dir"], "zpca", "pca_salmon_{molecule}"), + molecule=["genes", "transcripts"], + ), + zpca_kallisto=expand( + os.path.join(config["output_dir"], "zpca", "pca_kallisto_{molecule}"), + molecule=["genes", "transcripts"], + ), + multiqc_config=os.path.join(config["output_dir"], "multiqc_config.yaml"), output: - multiqc_report = directory( - os.path.join( - config["output_dir"], - "multiqc_summary")) - + multiqc_report=directory(os.path.join(config["output_dir"], "multiqc_summary")), params: - cluster_log_path = config["cluster_log_dir"], - results_dir = os.path.join( - config["output_dir"]), - log_dir = config["log_dir"], - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + results_dir=lambda wildcards, output: os.path.split(output.multiqc_report)[0], + log_dir=config["log_dir"], + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--outdir', - '--config', - ) - ) - - singularity: + "--outdir", + "--config", + ), + ), + container: "docker://quay.io/biocontainers/zavolan-multiqc-plugins:1.3--pyh5e36f6f_0" - conda: os.path.join(workflow.basedir, "envs", "zavolan-multiqc-plugins.yaml") - resources: - mem_mb=lambda wildcards, attempt: 4096 * attempt - + mem_mb=lambda wildcards, attempt: 4096 * attempt, log: - stderr = os.path.join( - config["log_dir"], - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - current_rule + ".stdout.log") - + stderr=os.path.join(config["log_dir"], f"{current_rule}.stderr.log"), + stdout=os.path.join(config["log_dir"], f"{current_rule}.stdout.log"), shell: "(multiqc \ --outdir {output.multiqc_report} \ @@ -2000,55 +1823,52 @@ rule multiqc_report: {params.log_dir};) \ 1> {log.stdout} 2> {log.stderr}" -current_rule = 'sort_bed_4_big' + +current_rule = "sort_bed_4_big" + + rule sort_bed_4_big: - ''' + """ sort bedGraphs in order to work with bedGraphtobigWig - ''' + """ input: - bg = os.path.join( + bg=os.path.join( config["output_dir"], "samples", "{sample}", "ALFA", "{renamed_unique}", - "{sample}.{renamed_unique}.{strand}.bg") - + "{sample}.{renamed_unique}.{strand}.bg", + ), output: - sorted_bg = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "bigWig", - "{renamed_unique}", - "{sample}_{renamed_unique}_{strand}.sorted.bg")) - - params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=( - '-i', - ) + sorted_bg=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "bigWig", + "{renamed_unique}", + "{sample}_{renamed_unique}_{strand}.sorted.bg", ) - - singularity: + ), + params: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=("-i",) + ), + container: "docker://quay.io/biocontainers/bedtools:2.27.1--h9a82719_5" - conda: os.path.join(workflow.basedir, "envs", "bedtools.yaml") - resources: - mem_mb=lambda wildcards, attempt: 2048 * attempt - + mem_mb=lambda wildcards, attempt: 2048 * attempt, log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{renamed_unique}_{strand}.stderr.log") - + f"{current_rule}_{{renamed_unique}}_{{strand}}.stderr.log", + ), shell: "(sortBed \ -i {input.bg} \ @@ -2056,72 +1876,62 @@ rule sort_bed_4_big: > {output.sorted_bg};) 2> {log.stderr}" -current_rule = 'prepare_bigWig' +current_rule = "prepare_bigWig" + + rule prepare_bigWig: - ''' + """ bedGraphtobigWig, for viewing in genome browsers - ''' + """ input: - sorted_bg = os.path.join( + sorted_bg=os.path.join( config["output_dir"], "samples", "{sample}", "bigWig", "{renamed_unique}", - "{sample}_{renamed_unique}_{strand}.sorted.bg"), - chr_sizes = lambda wildcards: - os.path.join( - config['star_indexes'], - get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), - get_sample( - 'index_size', - search_id='index', - search_value=wildcards.sample), - "STAR_index", - "chrNameLength.txt") - + "{sample}_{renamed_unique}_{strand}.sorted.bg", + ), + chr_sizes=lambda wildcards: os.path.join( + config["star_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("index_size", search_id="index", search_value=wildcards.sample), + "STAR_index", + "chrNameLength.txt", + ), output: - bigWig = os.path.join( + bigWig=os.path.join( config["output_dir"], "samples", "{sample}", "bigWig", "{renamed_unique}", - "{sample}_{renamed_unique}_{strand}.bw") - + "{sample}_{renamed_unique}_{strand}.bw", + ), params: - cluster_log_path = config["cluster_log_dir"], - additional_params = parse_rule_config( - rule_config, - current_rule=current_rule, - immutable=() - ) - - singularity: + cluster_log_path=config["cluster_log_dir"], + additional_params=parse_rule_config( + rule_config, current_rule=current_rule, immutable=() + ), + container: "docker://quay.io/biocontainers/ucsc-bedgraphtobigwig:377--h0b8a92a_2" - conda: os.path.join(workflow.basedir, "envs", "ucsc-bedgraphtobigwig.yaml") - resources: - mem_mb=lambda wildcards, attempt: 1024 * attempt - + mem_mb=lambda wildcards, attempt: 1024 * attempt, log: - stderr = os.path.join( + stderr=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{renamed_unique}_{strand}.stderr.log"), - - stdout = os.path.join( + f"{current_rule}_{{renamed_unique}}_{{strand}}.stderr.log", + ), + stdout=os.path.join( config["log_dir"], "samples", "{sample}", - current_rule + "_{renamed_unique}_{strand}.stdout.log") - + f"{current_rule}_{{renamed_unique}}_{{strand}}.stdout.log", + ), shell: "(bedGraphToBigWig \ {params.additional_params} \ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk new file mode 100644 index 0000000..9ded097 --- /dev/null +++ b/workflow/rules/common.smk @@ -0,0 +1,32 @@ +## Function definitions +def get_sample(column_id, search_id=None, search_value=None): + """Get relevant per sample information from samples table""" + if search_id: + if search_id == "index": + return str(samples_table[column_id][samples_table.index == search_value][0]) + else: + return str( + samples_table[column_id][samples_table[search_id] == search_value][0] + ) + else: + return str(samples_table[column_id][0]) + + +def get_directionality(libtype, tool): + """Get directionality value for different tools""" + directionality = "" + + for key in directionality_dict.keys(): + # Use the first of 'SF' or 'SR' that is found in libtype to look up directionality params for the current tool + if key in libtype: + directionality = directionality_dict[key][tool] + break + + # If libtype contains neither 'SF', nor 'SR' we don't know what to do + if not directionality: + raise ValueError( + f"Unknown libtype {libtype}.\n" + "Provide one of 'SF', 'SR', 'ISF', 'ISR', 'OSF', 'OSR', 'MSF', 'MSR' in samples.tsv!" + ) + + return directionality diff --git a/workflow/rules/htsinfer.smk b/workflow/rules/htsinfer.smk index 2b4c85f..e9e003f 100644 --- a/workflow/rules/htsinfer.smk +++ b/workflow/rules/htsinfer.smk @@ -8,46 +8,50 @@ config.setdefault("records", 100000) # global variables -samples = pd.read_csv(config["samples"], header=0, index_col=0, sep = "\t") +samples = pd.read_csv(config["samples"], header=0, index_col=0, sep="\t") OUT_DIR = config["outdir"] -LOG_DIR = os.path.join(OUT_DIR,"logs") -CLUSTER_LOG = os.path.join(LOG_DIR,"cluster_logs") +LOG_DIR = os.path.join(OUT_DIR, "logs") +CLUSTER_LOG = os.path.join(LOG_DIR, "cluster_logs") # Write inferred params into new sample table. -SAMPLES_OUT = os.path.join(OUT_DIR,config["samples_out"]) +SAMPLES_OUT = os.path.join(OUT_DIR, config["samples_out"]) +localrules: + all, + htsinfer_to_tsv, -localrules: all, htsinfer_to_tsv rule all: input: - SAMPLES_OUT + SAMPLES_OUT, + current_rule = "run_htsinfer" + + rule run_htsinfer: - ''' Run htsinfer on fastq samples - ''' + """ Run htsinfer on fastq samples + """ input: - fq1_path = lambda wildcards: - samples.loc[wildcards.sample, "fq1"] + fq1_path=lambda wildcards: samples.loc[wildcards.sample, "fq1"], output: - htsinfer_json = os.path.join(OUT_DIR, "htsinfer_{sample}.json") + htsinfer_json=os.path.join(OUT_DIR, "htsinfer_{sample}.json"), params: - fq2_path = lambda wildcards: - (samples.loc[wildcards.sample,"fq2"] if samples.loc[wildcards.sample,"fq2"] else ""), - records = config["records"], - outdir = OUT_DIR, - cluster_log_path = CLUSTER_LOG + fq2_path=lambda wildcards: ( + samples.loc[wildcards.sample, "fq2"] + if samples.loc[wildcards.sample, "fq2"] + else "" + ), + records=config["records"], + outdir=OUT_DIR, + cluster_log_path=CLUSTER_LOG, threads: 4 - singularity: + singularity: "docker://zavolab/htsinfer:0.9.0" - conda: + conda: os.path.join(workflow.basedir, "..", "envs", "htsinfer.yaml") log: - stderr = os.path.join( - LOG_DIR, - "{sample}", - current_rule + ".stderr.log") + stderr=os.path.join(LOG_DIR, "{sample}", current_rule + ".stderr.log"), shell: """ set +e @@ -59,36 +63,41 @@ rule run_htsinfer: fi """ + current_rule = "htsinfer_to_tsv" + + rule htsinfer_to_tsv: - '''Write inferred params for all samples to samples.tsv''' + """Write inferred params for all samples to samples.tsv""" input: - jlist = expand(os.path.join(OUT_DIR, - "htsinfer_{sample}.json"), - sample = samples.index.tolist()), - samples_in = config["samples"], - script = os.path.join(workflow.basedir,"..","scripts","htsinfer_to_tsv.py") + jlist=expand( + os.path.join(OUT_DIR, "htsinfer_{sample}.json"), + sample=samples.index.tolist(), + ), + samples_in=config["samples"], + script=os.path.join(workflow.basedir, "..", "scripts", "htsinfer_to_tsv.py"), output: - SAMPLES_OUT + SAMPLES_OUT, threads: 4 - singularity: + singularity: "docker://zavolab/htsinfer:0.9.0" - conda: + conda: os.path.join(workflow.basedir, "..", "envs", "htsinfer.yaml") log: - stderr = os.path.join( - LOG_DIR, - current_rule + ".stderr.log") + stderr=os.path.join(LOG_DIR, current_rule + ".stderr.log"), shell: - ''' + """ python {input.script} \ -f {input.jlist} \ -s {input.samples_in} \ -o {output} \ 2> {log.stderr} - ''' - + """ + + onsuccess: print("Workflow finished, no error.") + + onerror: print("Ooops... something went wrong") diff --git a/workflow/rules/paired_end.snakefile.smk b/workflow/rules/paired_end.snakefile.smk index ed78c2a..f5d65ef 100644 --- a/workflow/rules/paired_end.snakefile.smk +++ b/workflow/rules/paired_end.snakefile.smk @@ -1,81 +1,82 @@ -current_rule = 'pe_remove_adapters_cutadapt' +current_rule = "pe_remove_adapters_cutadapt" + + rule pe_remove_adapters_cutadapt: - ''' + """ Remove adapters - ''' + """ input: - reads1 = os.path.join( + reads1=os.path.join( config["output_dir"], "samples", "{sample}", "start", - "{sample}.fq1.fastq.gz"), - - reads2 = os.path.join( + "{sample}.fq1.fastq.gz", + ), + reads2=os.path.join( config["output_dir"], "samples", "{sample}", "start", - "{sample}.fq2.fastq.gz"), - + "{sample}.fq2.fastq.gz", + ), output: - reads1 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.fq1.pe.remove_adapters.fastq.gz")), - reads2 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.fq2.pe.remove_adapters.fastq.gz")) - + reads1=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "{sample}.fq1.pe.remove_adapters.fastq.gz", + ) + ), + reads2=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "{sample}.fq2.pe.remove_adapters.fastq.gz", + ) + ), params: - cluster_log_path = config["cluster_log_dir"], - adapter_3_mate1 = lambda wildcards: - get_sample('fq1_3p', search_id='index', search_value=wildcards.sample), - adapter_5_mate1 = lambda wildcards: - get_sample('fq1_5p', search_id='index', search_value=wildcards.sample), - adapter_3_mate2 = lambda wildcards: - get_sample('fq2_3p', search_id='index', search_value=wildcards.sample), - adapter_5_mate2 = lambda wildcards: - get_sample('fq2_5p', search_id='index', search_value=wildcards.sample), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + adapter_3_mate1=lambda wildcards: get_sample( + "fq1_3p", search_id="index", search_value=wildcards.sample + ), + adapter_5_mate1=lambda wildcards: get_sample( + "fq1_5p", search_id="index", search_value=wildcards.sample + ), + adapter_3_mate2=lambda wildcards: get_sample( + "fq2_3p", search_id="index", search_value=wildcards.sample + ), + adapter_5_mate2=lambda wildcards: get_sample( + "fq2_5p", search_id="index", search_value=wildcards.sample + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-a', - '-A', - '-g', - '-G', - '-o', - '-p', - ) - ) - - singularity: + "-a", + "-A", + "-g", + "-G", + "-o", + "-p", + ), + ), + container: "docker://quay.io/biocontainers/cutadapt:3.4--py37h73a75cf_1" - conda: os.path.join(workflow.basedir, "envs", "cutadapt.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 5000 * attempt - + mem_mb=lambda wildcards, attempt: 5000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stdout.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stdout.log" + ), shell: "(cutadapt \ -j {threads} \ @@ -92,93 +93,83 @@ rule pe_remove_adapters_cutadapt: 1> {log.stdout} 2>{log.stderr}" -current_rule = 'pe_remove_polya_cutadapt' +current_rule = "pe_remove_polya_cutadapt" + + rule pe_remove_polya_cutadapt: - ''' + """ Remove polyA tails - ''' + """ input: - reads1 = os.path.join( + reads1=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.pe.remove_adapters.fastq.gz"), - reads2 = os.path.join( + "{sample}.fq1.pe.remove_adapters.fastq.gz", + ), + reads2=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq2.pe.remove_adapters.fastq.gz") - + "{sample}.fq2.pe.remove_adapters.fastq.gz", + ), output: - reads1 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.fq1.pe.remove_polya.fastq.gz")), - reads2 = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.fq2.pe.remove_polya.fastq.gz")) - + reads1=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "{sample}.fq1.pe.remove_polya.fastq.gz", + ) + ), + reads2=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "{sample}.fq2.pe.remove_polya.fastq.gz", + ) + ), params: - cluster_log_path = config["cluster_log_dir"], - polya_3_mate1 = lambda wildcards: - get_sample( - 'fq1_polya_3p', - search_id='index', - search_value=wildcards.sample), - polya_5_mate1 = lambda wildcards: - get_sample( - 'fq1_polya_5p', - search_id='index', - search_value=wildcards.sample), - polya_3_mate2 = lambda wildcards: - get_sample( - 'fq2_polya_3p', - search_id='index', - search_value=wildcards.sample), - polya_5_mate2 = lambda wildcards: - get_sample( - 'fq2_polya_5p', - search_id='index', - search_value=wildcards.sample), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + polya_3_mate1=lambda wildcards: get_sample( + "fq1_polya_3p", search_id="index", search_value=wildcards.sample + ), + polya_5_mate1=lambda wildcards: get_sample( + "fq1_polya_5p", search_id="index", search_value=wildcards.sample + ), + polya_3_mate2=lambda wildcards: get_sample( + "fq2_polya_3p", search_id="index", search_value=wildcards.sample + ), + polya_5_mate2=lambda wildcards: get_sample( + "fq2_polya_5p", search_id="index", search_value=wildcards.sample + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-a', - '-A', - '-g', - '-G', - '-o', - '-p', - ) - ) - - singularity: + "-a", + "-A", + "-g", + "-G", + "-o", + "-p", + ), + ), + container: "docker://quay.io/biocontainers/cutadapt:3.4--py37h73a75cf_1" - conda: os.path.join(workflow.basedir, "envs", "cutadapt.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 5000 * attempt - + mem_mb=lambda wildcards, attempt: 5000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stdout.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stdout.log" + ), shell: "(cutadapt \ -j {threads} \ @@ -195,106 +186,93 @@ rule pe_remove_polya_cutadapt: 1> {log.stdout} 2>{log.stderr}" -current_rule = 'pe_map_genome_star' +current_rule = "pe_map_genome_star" + + rule pe_map_genome_star: - ''' + """ Map to genome using STAR - ''' + """ input: - index = lambda wildcards: - os.path.join( - config["star_indexes"], - get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), - get_sample( - 'index_size', - search_id='index', - search_value=wildcards.sample), - "STAR_index", - "chrNameLength.txt"), - reads1 = os.path.join( + index=lambda wildcards: os.path.join( + config["star_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("index_size", search_id="index", search_value=wildcards.sample), + "STAR_index", + "chrNameLength.txt", + ), + reads1=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.pe.remove_polya.fastq.gz"), - reads2 = os.path.join( + "{sample}.fq1.pe.remove_polya.fastq.gz", + ), + reads2=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq2.pe.remove_polya.fastq.gz") - + "{sample}.fq2.pe.remove_polya.fastq.gz", + ), output: - bam = os.path.join( + bam=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.pe.Aligned.out.bam"), - logfile = os.path.join( + "{sample}.pe.Aligned.out.bam", + ), + logfile=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.pe.Log.final.out") - - shadow: "minimal" - + "{sample}.pe.Log.final.out", + ), + shadow: + "minimal" params: - cluster_log_path = config["cluster_log_dir"], - sample_id = "{sample}", - index = lambda wildcards: - os.path.abspath(os.path.join( + cluster_log_path=config["cluster_log_dir"], + sample_id="{sample}", + index=lambda wildcards: os.path.abspath( + os.path.join( config["star_indexes"], get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), + "organism", search_id="index", search_value=wildcards.sample + ), get_sample( - 'index_size', - search_id='index', - search_value=wildcards.sample), - "STAR_index")), - outFileNamePrefix = os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.pe."), - additional_params = parse_rule_config( + "index_size", search_id="index", search_value=wildcards.sample + ), + "STAR_index", + ) + ), + outFileNamePrefix=lambda wildcards, output: output.bam.replace( + "Aligned.out.bam", "" + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--genomeDir', - '--readFilesIn', - '--readFilesCommand', - '--outFileNamePrefix', - '--outSAMattributes', - '--outStd', - '--outSAMtype', - '--outSAMattrRGline', - ) - ) - - singularity: + "--genomeDir", + "--readFilesIn", + "--readFilesCommand", + "--outFileNamePrefix", + "--outSAMattributes", + "--outStd", + "--outSAMtype", + "--outSAMattrRGline", + ), + ), + container: "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1" - conda: os.path.join(workflow.basedir, "envs", "STAR.yaml") - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 32000 * attempt - + mem_mb=lambda wildcards, attempt: 32000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stderr.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stderr.log" + ), shell: "(STAR \ --runThreadN {threads} \ @@ -311,120 +289,102 @@ rule pe_map_genome_star: 2> {log.stderr}" -current_rule = 'pe_quantification_salmon' +current_rule = "pe_quantification_salmon" + + rule pe_quantification_salmon: - ''' + """ Quantification at transcript and gene level using Salmon - ''' + """ input: - reads1 = os.path.join( + reads1=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.pe.remove_polya.fastq.gz"), - reads2 = os.path.join( + "{sample}.fq1.pe.remove_polya.fastq.gz", + ), + reads2=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq2.pe.remove_polya.fastq.gz"), - gtf = lambda wildcards: - os.path.abspath(get_sample( - 'gtf', - search_id='index', - search_value=wildcards.sample)), - index = lambda wildcards: - os.path.join( - config["salmon_indexes"], - get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), - get_sample( - 'kmer', - search_id='index', - search_value=wildcards.sample), - "salmon.idx") - + "{sample}.fq2.pe.remove_polya.fastq.gz", + ), + gtf=lambda wildcards: os.path.abspath( + get_sample("gtf", search_id="index", search_value=wildcards.sample) + ), + index=lambda wildcards: os.path.join( + config["salmon_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("kmer", search_id="index", search_value=wildcards.sample), + "salmon.idx", + ), output: - gn_estimates = os.path.join( + gn_estimates=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.pe", - "quant.genes.sf"), - tr_estimates = os.path.join( + "quant.genes.sf", + ), + tr_estimates=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.pe", - "quant.sf"), - meta_info = os.path.join( + "quant.sf", + ), + meta_info=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.pe", "aux_info", - "meta_info.json"), - flenDist = os.path.join( + "meta_info.json", + ), + flenDist=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.pe", "libParams", - "flenDist.txt") - - shadow: "minimal" - + "flenDist.txt", + ), + shadow: + "minimal" params: - cluster_log_path = config["cluster_log_dir"], - output_dir = os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.salmon.pe"), - libType = lambda wildcards: - get_sample( - 'libtype', - search_id='index', - search_value=wildcards.sample), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.tr_estimates), + libType=lambda wildcards: get_sample( + "libtype", search_id="index", search_value=wildcards.sample + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--libType', - '--fldMean', - '--fldSD', - '--index', - '--geneMap', - '-1', - '-2', - '-o', - ) - ) - - singularity: + "--libType", + "--fldMean", + "--fldSD", + "--index", + "--geneMap", + "-1", + "-2", + "-o", + ), + ), + container: "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 6 - resources: - mem_mb=lambda wildcards, attempt: 32000 * attempt - + mem_mb=lambda wildcards, attempt: 32000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stdout.log"), - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stdout.log" + ), shell: "(salmon quant \ --libType {params.libType} \ @@ -438,93 +398,80 @@ rule pe_quantification_salmon: ) 1> {log.stdout} 2> {log.stderr}" -current_rule = 'pe_genome_quantification_kallisto' +current_rule = "pe_genome_quantification_kallisto" + + rule pe_genome_quantification_kallisto: - ''' + """ Quantification at transcript and gene level using Kallisto - ''' + """ input: - reads1 = os.path.join( + reads1=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.pe.remove_polya.fastq.gz"), - reads2 = os.path.join( + "{sample}.fq1.pe.remove_polya.fastq.gz", + ), + reads2=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq2.pe.remove_polya.fastq.gz"), - index = lambda wildcards: - os.path.join( - config["kallisto_indexes"], - get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), - "kallisto.idx") - + "{sample}.fq2.pe.remove_polya.fastq.gz", + ), + index=lambda wildcards: os.path.join( + config["kallisto_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + "kallisto.idx", + ), output: - pseudoalignment = os.path.join( + pseudoalignment=os.path.join( config["output_dir"], "samples", "{sample}", "quant_kallisto", - "{sample}.pe.kallisto.pseudo.sam"), - abundances = os.path.join( + "{sample}.pe.kallisto.pseudo.sam", + ), + abundances=os.path.join( config["output_dir"], "samples", "{sample}", "quant_kallisto", - "abundance.h5") - - shadow: "minimal" - + "abundance.h5", + ), + shadow: + "minimal" params: - cluster_log_path = config["cluster_log_dir"], - output_dir = os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto"), - directionality = lambda wildcards: - get_directionality(get_sample( - 'libtype', - search_id='index', - search_value=wildcards.sample),"kallisto"), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.pseudoalignment), + directionality=lambda wildcards: get_directionality( + get_sample("libtype", search_id="index", search_value=wildcards.sample), + "kallisto", + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--single', - '-i', - '-o', - '-l', - '-s', - '--pseudobam', - '--fr-stranded', - '--rf-stranded', - ) - ) - - - singularity: + "--single", + "-i", + "-o", + "-l", + "-s", + "--pseudobam", + "--fr-stranded", + "--rf-stranded", + ), + ), + container: "docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2" - conda: os.path.join(workflow.basedir, "envs", "kallisto.yaml") - threads: 8 - resources: - mem_mb=lambda wildcards, attempt: 6000 * attempt - + mem_mb=lambda wildcards, attempt: 6000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".stderr.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".stderr.log" + ), shell: "(kallisto quant \ -i {input.index} \ @@ -535,4 +482,3 @@ rule pe_genome_quantification_kallisto: --pseudobam \ {input.reads1} {input.reads2} > {output.pseudoalignment}) \ 2> {log.stderr}" - diff --git a/workflow/rules/single_end.snakefile.smk b/workflow/rules/single_end.snakefile.smk index 46fd52a..0c651df 100644 --- a/workflow/rules/single_end.snakefile.smk +++ b/workflow/rules/single_end.snakefile.smk @@ -1,71 +1,61 @@ -current_rule = 'remove_adapters_cutadapt' +current_rule = "remove_adapters_cutadapt" + + rule remove_adapters_cutadapt: - ''' + """ Remove adapters - ''' + """ input: - reads = os.path.join( + reads=os.path.join( config["output_dir"], "samples", "{sample}", "start", - "{sample}.fq1.fastq.gz") - + "{sample}.fq1.fastq.gz", + ), output: - reads = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.fq1.se.remove_adapters.fastq.gz")) - + reads=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "{sample}.fq1.se.remove_adapters.fastq.gz", + ) + ), params: - cluster_log_path = config["cluster_log_dir"], - adapters_3 = lambda wildcards: - get_sample( - 'fq1_3p', - search_id='index', - search_value=wildcards.sample), - adapters_5 = lambda wildcards: - get_sample( - 'fq1_5p', - search_id='index', - search_value=wildcards.sample), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + adapters_3=lambda wildcards: get_sample( + "fq1_3p", search_id="index", search_value=wildcards.sample + ), + adapters_5=lambda wildcards: get_sample( + "fq1_5p", search_id="index", search_value=wildcards.sample + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-a', - '-A', - '-g', - '-G', - '-o', - '-p', - ) - ) - - singularity: + "-a", + "-A", + "-g", + "-G", + "-o", + "-p", + ), + ), + container: "docker://quay.io/biocontainers/cutadapt:3.4--py37h73a75cf_1" - conda: os.path.join(workflow.basedir, "envs", "cutadapt.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 5000 * attempt - + mem_mb=lambda wildcards, attempt: 5000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stdout.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stdout.log" + ), shell: "(cutadapt \ -j {threads} \ @@ -78,73 +68,63 @@ rule remove_adapters_cutadapt: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'remove_polya_cutadapt' +current_rule = "remove_polya_cutadapt" + + rule remove_polya_cutadapt: - ''' + """ Remove ployA tails - ''' + """ input: - reads = os.path.join( + reads=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.se.remove_adapters.fastq.gz") - + "{sample}.fq1.se.remove_adapters.fastq.gz", + ), output: - reads = temp(os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.fq1.se.remove_polya.fastq.gz")) - + reads=temp( + os.path.join( + config["output_dir"], + "samples", + "{sample}", + "{sample}.fq1.se.remove_polya.fastq.gz", + ) + ), params: - cluster_log_path = config["cluster_log_dir"], - polya_3 = lambda wildcards: - get_sample( - 'fq1_polya_3p', - search_id='index', - search_value=wildcards.sample), - polya_5 = lambda wildcards: - get_sample( - 'fq1_polya_5p', - search_id='index', - search_value=wildcards.sample), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + polya_3=lambda wildcards: get_sample( + "fq1_polya_3p", search_id="index", search_value=wildcards.sample + ), + polya_5=lambda wildcards: get_sample( + "fq1_polya_5p", search_id="index", search_value=wildcards.sample + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '-a', - '-A', - '-g', - '-G', - '-o', - '-p', - ) - ) - - singularity: + "-a", + "-A", + "-g", + "-G", + "-o", + "-p", + ), + ), + container: "docker://quay.io/biocontainers/cutadapt:3.4--py37h73a75cf_1" - conda: os.path.join(workflow.basedir, "envs", "cutadapt.yaml") - threads: 4 - resources: - mem_mb=lambda wildcards, attempt: 5000 * attempt - + mem_mb=lambda wildcards, attempt: 5000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stdout.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stdout.log" + ), shell: "(cutadapt \ -j {threads} \ @@ -157,89 +137,87 @@ rule remove_polya_cutadapt: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'map_genome_star' +current_rule = "map_genome_star" + + rule map_genome_star: - ''' + """ Map to genome using STAR - ''' + """ input: - index = lambda wildcards: - os.path.join( - config["star_indexes"], - get_sample('organism', search_id='index', search_value=wildcards.sample), - get_sample('index_size', search_id='index', search_value=wildcards.sample), - "STAR_index", - "chrNameLength.txt"), - reads = os.path.join( + index=lambda wildcards: os.path.join( + config["star_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("index_size", search_id="index", search_value=wildcards.sample), + "STAR_index", + "chrNameLength.txt", + ), + reads=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.se.remove_polya.fastq.gz") - + "{sample}.fq1.se.remove_polya.fastq.gz", + ), output: - bam = os.path.join( + bam=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.se.Aligned.out.bam"), - logfile = os.path.join( + "{sample}.se.Aligned.out.bam", + ), + logfile=os.path.join( config["output_dir"], "samples", "{sample}", "map_genome", - "{sample}.se.Log.final.out") - - shadow: "minimal" - + "{sample}.se.Log.final.out", + ), + shadow: + "minimal" params: - cluster_log_path = config["cluster_log_dir"], - sample_id = "{sample}", - index = lambda wildcards: - os.path.abspath(os.path.join( + cluster_log_path=config["cluster_log_dir"], + sample_id="{sample}", + index=lambda wildcards: os.path.abspath( + os.path.join( config["star_indexes"], - get_sample('organism', search_id='index', search_value=wildcards.sample), - get_sample('index_size', search_id='index', search_value=wildcards.sample), - "STAR_index")), - outFileNamePrefix = os.path.join( - config["output_dir"], - "samples", - "{sample}", - "map_genome", - "{sample}.se."), - additional_params = parse_rule_config( + get_sample( + "organism", search_id="index", search_value=wildcards.sample + ), + get_sample( + "index_size", search_id="index", search_value=wildcards.sample + ), + "STAR_index", + ) + ), + outFileNamePrefix=lambda wildcards, output: output.bam.replace( + "Aligned.out.bam", "" + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--genomeDir', - '--readFilesIn', - '--readFilesCommand', - '--outFileNamePrefix', - '--outSAMattributes', - '--outStd', - '--outSAMtype', - '--outSAMattrRGline', - ) - ) - - singularity: + "--genomeDir", + "--readFilesIn", + "--readFilesCommand", + "--outFileNamePrefix", + "--outSAMattributes", + "--outStd", + "--outSAMtype", + "--outSAMattrRGline", + ), + ), + container: "docker://quay.io/biocontainers/star:2.7.8a--h9ee0642_1" - conda: os.path.join(workflow.basedir, "envs", "STAR.yaml") - threads: 12 - resources: - mem_mb=lambda wildcards, attempt: 32000 * attempt - + mem_mb=lambda wildcards, attempt: 32000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stderr.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stderr.log" + ), shell: "(STAR \ --runThreadN {threads} \ @@ -256,124 +234,101 @@ rule map_genome_star: 2> {log.stderr}" -current_rule = 'quantification_salmon' +current_rule = "quantification_salmon" + + rule quantification_salmon: - ''' + """ Quantification at transcript and gene level using Salmon - ''' + """ input: - reads = os.path.join( + reads=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.se.remove_polya.fastq.gz"), - index = lambda wildcards: - os.path.join( - config["salmon_indexes"], - get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), - get_sample( - 'kmer', - search_id='index', - search_value=wildcards.sample), - "salmon.idx"), - gtf = lambda wildcards: - os.path.abspath(get_sample( - 'gtf', - search_id='index', - search_value=wildcards.sample)) - + "{sample}.fq1.se.remove_polya.fastq.gz", + ), + index=lambda wildcards: os.path.join( + config["salmon_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + get_sample("kmer", search_id="index", search_value=wildcards.sample), + "salmon.idx", + ), + gtf=lambda wildcards: os.path.abspath( + get_sample("gtf", search_id="index", search_value=wildcards.sample) + ), output: - gn_estimates = os.path.join( + gn_estimates=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.se", - "quant.genes.sf"), - tr_estimates = os.path.join( + "quant.genes.sf", + ), + tr_estimates=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.se", - "quant.sf"), - meta_info = os.path.join( + "quant.sf", + ), + meta_info=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.se", "aux_info", - "meta_info.json"), - flenDist = os.path.join( + "meta_info.json", + ), + flenDist=os.path.join( config["output_dir"], "samples", "{sample}", "{sample}.salmon.se", "libParams", - "flenDist.txt") - - shadow: "minimal" - + "flenDist.txt", + ), + shadow: + "minimal" params: - cluster_log_path = config["cluster_log_dir"], - output_dir = os.path.join( - config["output_dir"], - "samples", - "{sample}", - "{sample}.salmon.se"), - libType = lambda wildcards: - get_sample( - 'libtype', - search_id='index', - search_value=wildcards.sample), - fraglen = lambda wildcards: - get_sample( - 'mean', - search_id='index', - search_value=wildcards.sample), - fragsd = lambda wildcards: - get_sample( - 'sd', - search_id='index', - search_value=wildcards.sample), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.tr_estimates), + libType=lambda wildcards: get_sample( + "libtype", search_id="index", search_value=wildcards.sample + ), + fraglen=lambda wildcards: get_sample( + "mean", search_id="index", search_value=wildcards.sample + ), + fragsd=lambda wildcards: get_sample( + "sd", search_id="index", search_value=wildcards.sample + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--libType', - '--fldMean', - '--fldSD', - '--index', - '--geneMap', - '--unmatedReads', - '-o', - ) - ) - - singularity: + "--libType", + "--fldMean", + "--fldSD", + "--index", + "--geneMap", + "--unmatedReads", + "-o", + ), + ), + container: "docker://quay.io/biocontainers/salmon:1.4.0--h84f40af_1" - conda: os.path.join(workflow.basedir, "envs", "salmon.yaml") - threads: 6 - resources: - mem_mb=lambda wildcards, attempt: 32000 * attempt - + mem_mb=lambda wildcards, attempt: 32000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stdout.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stdout.log" + ), shell: "(salmon quant \ --libType {params.libType} \ @@ -388,97 +343,80 @@ rule quantification_salmon: 1> {log.stdout} 2> {log.stderr}" -current_rule = 'genome_quantification_kallisto' +current_rule = "genome_quantification_kallisto" + + rule genome_quantification_kallisto: - ''' + """ Quantification at transcript and gene level using Kallisto - ''' + """ input: - reads = os.path.join( + reads=os.path.join( config["output_dir"], "samples", "{sample}", - "{sample}.fq1.se.remove_polya.fastq.gz"), - index = lambda wildcards: - os.path.join( - config["kallisto_indexes"], - get_sample( - 'organism', - search_id='index', - search_value=wildcards.sample), - "kallisto.idx") - + "{sample}.fq1.se.remove_polya.fastq.gz", + ), + index=lambda wildcards: os.path.join( + config["kallisto_indexes"], + get_sample("organism", search_id="index", search_value=wildcards.sample), + "kallisto.idx", + ), output: - pseudoalignment = os.path.join( + pseudoalignment=os.path.join( config["output_dir"], "samples", "{sample}", "quant_kallisto", - "{sample}.se.kallisto.pseudo.sam"), - abundances = os.path.join( + "{sample}.se.kallisto.pseudo.sam", + ), + abundances=os.path.join( config["output_dir"], "samples", "{sample}", "quant_kallisto", - "abundance.h5") - - shadow: "minimal" - + "abundance.h5", + ), + shadow: + "minimal" params: - cluster_log_path = config["cluster_log_dir"], - output_dir = os.path.join( - config["output_dir"], - "samples", - "{sample}", - "quant_kallisto"), - fraglen = lambda wildcards: - get_sample( - 'mean', - search_id='index', - search_value=wildcards.sample), - fragsd = lambda wildcards: - get_sample( - 'sd', - search_id='index', - search_value=wildcards.sample), - directionality = lambda wildcards: - get_directionality(get_sample( - 'libtype', - search_id='index', - search_value=wildcards.sample),"kallisto"), - additional_params = parse_rule_config( + cluster_log_path=config["cluster_log_dir"], + output_dir=lambda wildcards, output: os.path.dirname(output.pseudoalignment), + fraglen=lambda wildcards: get_sample( + "mean", search_id="index", search_value=wildcards.sample + ), + fragsd=lambda wildcards: get_sample( + "sd", search_id="index", search_value=wildcards.sample + ), + directionality=lambda wildcards: get_directionality( + get_sample("libtype", search_id="index", search_value=wildcards.sample), + "kallisto", + ), + additional_params=parse_rule_config( rule_config, current_rule=current_rule, immutable=( - '--single', - '-i', - '-o', - '-l', - '-s', - '--pseudobam', - '--fr-stranded', - '--rf-stranded', - ) - ) - - singularity: + "--single", + "-i", + "-o", + "-l", + "-s", + "--pseudobam", + "--fr-stranded", + "--rf-stranded", + ), + ), + container: "docker://quay.io/biocontainers/kallisto:0.46.2--h60f4f9f_2" - conda: os.path.join(workflow.basedir, "envs", "kallisto.yaml") - threads: 8 - resources: - mem_mb=lambda wildcards, attempt: 6000 * attempt - + mem_mb=lambda wildcards, attempt: 6000 * attempt, log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - current_rule + ".se.stderr.log") - + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", current_rule + ".se.stderr.log" + ), shell: "(kallisto quant \ --single \ @@ -492,4 +430,3 @@ rule genome_quantification_kallisto: --pseudobam \ {input.reads} > {output.pseudoalignment};) \ 2> {log.stderr}" - diff --git a/workflow/rules/sra_download.smk b/workflow/rules/sra_download.smk index c4d3133..358f536 100644 --- a/workflow/rules/sra_download.smk +++ b/workflow/rules/sra_download.smk @@ -2,41 +2,42 @@ import pandas as pd -samples = pd.read_csv(config["samples"], header=0, index_col=0, sep = "\t") +samples = pd.read_csv(config["samples"], header=0, index_col=0, sep="\t") DOWNLOAD_DIR = config["outdir"] # Write fastq.gz location into new sample table. SAMPLES_OUT = config["samples_out"] samples_mod = samples.copy() -localrules: prefetch, add_fq_file_path, all + +localrules: + prefetch, + add_fq_file_path, + all, + rule all: "Target rule." input: - SAMPLES_OUT + SAMPLES_OUT, rule prefetch: "Prefetch SRA entry. Requires internet access." output: - os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.sra") + os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.sra"), params: - outdir = DOWNLOAD_DIR + outdir=DOWNLOAD_DIR, conda: os.path.join(workflow.basedir, "..", "envs", "sra-tools.yaml") singularity: "docker://ncbi/sra-tools" log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - "prefetch.stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - "prefetch.stdout.log") + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", "prefetch.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", "prefetch.stdout.log" + ), shell: """ prefetch {wildcards.sample} \ @@ -44,33 +45,30 @@ rule prefetch: 1> {log.stdout} 2> {log.stderr} """ + rule fasterq_dump: "Dump SRA entry as fastq file(s)." input: - os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.sra") + os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.sra"), output: - os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.dumped") + os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.dumped"), params: - outdir = lambda wildcards: os.path.join(DOWNLOAD_DIR, wildcards.sample), - cluster_log_path = config["cluster_log_dir"] + outdir=lambda wildcards: os.path.join(DOWNLOAD_DIR, wildcards.sample), + cluster_log_path=config["cluster_log_dir"], resources: - mem_mb = lambda wildcards, attempt: 2048 * attempt + mem_mb=lambda wildcards, attempt: 2048 * attempt, threads: 4 conda: os.path.join(workflow.basedir, "..", "envs", "sra-tools.yaml") singularity: "docker://ncbi/sra-tools" log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - "fasterq_dump.stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - "fasterq_dump.stdout.log") + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", "fasterq_dump.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", "fasterq_dump.stdout.log" + ), shell: """ fasterq-dump {params.outdir} --outdir {params.outdir} \ @@ -86,7 +84,7 @@ def get_fastq_files(wildcards): Args: wildcards: Snakemake wildcards, contains name "sample". - + Returns: list (str): paths to .fastq files. """ @@ -95,34 +93,30 @@ def get_fastq_files(wildcards): for f in files: if f.endswith(".fastq"): to_zip.append(os.path.join(DOWNLOAD_DIR, wildcards.sample, f)) - return(to_zip) + return to_zip rule compress_fastq: "Compress fastq inplace with pigz at best (9) compression level." input: - tmpf = os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.dumped") + tmpf=os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.dumped"), output: - os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.processed") + os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.processed"), params: - files = get_fastq_files, - cluster_log_path = config["cluster_log_dir"] + files=get_fastq_files, + cluster_log_path=config["cluster_log_dir"], threads: 6 conda: os.path.join(workflow.basedir, "..", "envs", "pigz.yaml") singularity: "docker://bytesco/pigz" log: - stderr = os.path.join( - config["log_dir"], - "samples", - "{sample}", - "compress_fastq.stderr.log"), - stdout = os.path.join( - config["log_dir"], - "samples", - "{sample}", - "compress_fastq.stdout.log") + stderr=os.path.join( + config["log_dir"], "samples", "{sample}", "compress_fastq.stderr.log" + ), + stdout=os.path.join( + config["log_dir"], "samples", "{sample}", "compress_fastq.stdout.log" + ), shell: """ pigz --best --processes {threads} {params.files} \ @@ -130,14 +124,16 @@ rule compress_fastq: touch {output} """ + rule add_fq_file_path: "Add fastq paths to sample table." input: - expand(os.path.join(DOWNLOAD_DIR, - "{sample}", "{sample}.processed"), - sample = samples[samples.index.str.contains("SRR")].index.tolist()) + expand( + os.path.join(DOWNLOAD_DIR, "{sample}", "{sample}.processed"), + sample=samples[samples.index.str.contains("SRR")].index.tolist(), + ), output: - SAMPLES_OUT + SAMPLES_OUT, run: for sample in input: files = os.listdir(os.path.dirname(sample)) @@ -153,4 +149,4 @@ rule add_fq_file_path: gzs.sort() samples_mod.loc[sample_name, "fq1"] = gzs[0] samples_mod.loc[sample_name, "fq2"] = gzs[1] - samples_mod.to_csv(SAMPLES_OUT, index=True, sep = "\t") + samples_mod.to_csv(SAMPLES_OUT, index=True, sep="\t")