diff --git a/reneo/__main__.py b/reneo/__main__.py index 8c63e48..ed0ea82 100644 --- a/reneo/__main__.py +++ b/reneo/__main__.py @@ -110,6 +110,7 @@ def cli(): all Run everything (default) preprocess Run preprocessing only reneo Run reneo (and preprocessing if needed) + postprocess Run postprocessing (with preprocessing and reneo if needed) print_targets List available targets """ diff --git a/reneo/workflow/reneo.smk b/reneo/workflow/reneo.smk index d4b5e33..3685a48 100644 --- a/reneo/workflow/reneo.smk +++ b/reneo/workflow/reneo.smk @@ -28,7 +28,7 @@ def targetRule(fn): target_rules.append(fn.__name__[2:]) return fn -localrules: all, preprocess, reneo, koverage_tsv, print_stages +localrules: all, preprocess, reneo, koverage_tsv, postprocess, print_stages """Run stages""" @@ -36,7 +36,8 @@ localrules: all, preprocess, reneo, koverage_tsv, print_stages rule all: input: preprocessTargets, - reneoTargets + reneoTargets, + postprocessTargets @targetRule @@ -51,6 +52,12 @@ rule reneo: reneoTargets +@targetRule +rule postprocess: + input: + postprocessTargets + + @targetRule rule print_stages: run: @@ -73,3 +80,7 @@ include: os.path.join("rules", "genes.smk") # Step 5: Run Reneo include: os.path.join("rules", "reneo.smk") + + +# Step 6: Postprocess genomes +include: os.path.join("rules", "postprocess.smk") \ No newline at end of file diff --git a/reneo/workflow/rules/02_reneo_targets.smk b/reneo/workflow/rules/02_reneo_targets.smk index b178d3f..764c326 100644 --- a/reneo/workflow/rules/02_reneo_targets.smk +++ b/reneo/workflow/rules/02_reneo_targets.smk @@ -1,6 +1,7 @@ preprocessTargets = [] reneoTargets = [] +postprocessTargets = [] """PREPROCESSING TARGETS""" @@ -37,3 +38,10 @@ reneoTargets.append(RESOLVED_COMP_INFO) COMP_VOGS = os.path.join(OUTDIR, "component_vogs.txt") reneoTargets.append(COMP_VOGS) + +"""POSTPROCESSING TARGETS""" +GENOME_KOVERAGE_RES = os.path.join(OUTDIR, "results", "sample_coverage.tsv") +postprocessTargets.append(GENOME_KOVERAGE_RES) + +GENOME_READ_COUNTS = os.path.join(OUTDIR, "sample_genome_read_counts.tsv") +postprocessTargets.append(GENOME_READ_COUNTS) \ No newline at end of file diff --git a/reneo/workflow/rules/postprocess.smk b/reneo/workflow/rules/postprocess.smk new file mode 100644 index 0000000..e333f7c --- /dev/null +++ b/reneo/workflow/rules/postprocess.smk @@ -0,0 +1,62 @@ +rule combine_genomes_and_unresolved_edges: + """Combine resolved genomes and unresolved edges""" + input: + genomes = RESOLVED_GENOMES, + unresolved_edges = os.path.join(OUTDIR, "unresolved_virus_like_edges.fasta") + output: + os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta") + shell: + """ + cat {input.genomes} {input.unresolved_edges} > {output} + """ + + +rule koverage_genomes: + """Get coverage statistics with Koverage""" + input: + tsv = os.path.join(OUTDIR,"reneo.samples.tsv"), + sequences = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta") + params: + out_dir = OUTDIR, + profile = lambda wildcards: "--profile " + config["profile"] if config["profile"] else "", + output: + os.path.join(OUTDIR, "results", "sample_coverage.tsv") + threads: + config["resources"]["jobCPU"] + resources: + mem_mb = config["resources"]["jobMem"], + mem = str(config["resources"]["jobMem"]) + "MB" + conda: + os.path.join("..", "envs", "koverage.yaml") + shell: + """ + koverage run \ + --reads {input.tsv} \ + --ref {input.sequences} \ + --threads {threads} \ + --output {params.out_dir} \ + {params.profile} + """ + + +rule koverage_postprocess: + """Format TSV of samples and reads from Koverage""" + input: + koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"), + samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"), + seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta") + output: + os.path.join(OUTDIR, "sample_genome_read_counts.tsv") + params: + koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"), + samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"), + seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta"), + info_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges_info.tsv"), + output_path = OUTDIR, + log = os.path.join(LOGSDIR, "format_koverage_results_output.log") + log: + os.path.join(LOGSDIR, "format_koverage_results_output.log") + conda: + os.path.join("..", "envs", "reneo.yaml") + script: + os.path.join("..", "scripts", "format_koverage_results.py") \ No newline at end of file diff --git a/reneo/workflow/rules/reneo.smk b/reneo/workflow/rules/reneo.smk index 0736961..e6bc46d 100644 --- a/reneo/workflow/rules/reneo.smk +++ b/reneo/workflow/rules/reneo.smk @@ -12,6 +12,7 @@ rule run_reneo: unitigs = os.path.join(OUTDIR, "resolved_edges.fasta"), component_info = os.path.join(OUTDIR, "resolved_component_info.txt"), vog_comp_info = os.path.join(OUTDIR, "component_vogs.txt"), + unresolved_edges = os.path.join(OUTDIR, "unresolved_virus_like_edges.fasta"), params: graph = GRAPH_FILE, hmmout = SMG_FILE, diff --git a/reneo/workflow/scripts/format_koverage_results.py b/reneo/workflow/scripts/format_koverage_results.py new file mode 100644 index 0000000..87f37f0 --- /dev/null +++ b/reneo/workflow/scripts/format_koverage_results.py @@ -0,0 +1,157 @@ +#!/usr/bin/python3 + +"""format_koverage_results.py: Format koverage results. + +""" + +import logging +import os +import subprocess +from Bio import SeqIO +from collections import defaultdict + +import pandas as pd + +__author__ = "Vijini Mallawaarachchi" +__copyright__ = "Copyright 2023, Reneo Project" +__license__ = "MIT" +__type__ = "Support Script" +__maintainer__ = "Vijini Mallawaarachchi" +__email__ = "viji.mallawaarachchi@gmail.com" + + +def main(): + # Get arguments + # ----------------------- + + samples_file = snakemake.params.samples_file + koverage_tsv = snakemake.params.koverage_tsv + seq_file = snakemake.params.seq_file + info_file = snakemake.params.info_file + output_path = snakemake.params.output_path + log = snakemake.params.log + + # Setup logger + # ---------------------------------------------------------------------- + + logger = logging.getLogger("format_coverage") + logger.setLevel(logging.DEBUG) + logging.captureWarnings(True) + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + consoleHeader.setLevel(logging.INFO) + logger.addHandler(consoleHeader) + + # Setup output path for log file + if log is None: + fileHandler = logging.FileHandler(f"{log}") + else: + fileHandler = logging.FileHandler(f"{log}") + + fileHandler.setLevel(logging.DEBUG) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) + + # Validate inputs + # --------------------------------------------------- + + # Handle for missing trailing forwardslash in output folder path + if output_path[-1:] != "/": + output_path = output_path + "/" + + # Create output folder if it does not exist + if not os.path.isdir(output_path): + subprocess.run("mkdir -p " + output_path, shell=True) + + # Get sample-wise genome coverage stats + # ---------------------------------------------------------------------- + + # Log inputs + logger.info(f"Samples file: {samples_file}") + logger.info(f"Koverage results: {koverage_tsv}") + logger.info(f"Output path: {output_path}") + + # Get sample names + mysamples = [s.split("\t")[0] for s in open(samples_file, "r")] + logger.debug(mysamples) + + # Initialise dataframe + df_read_counts = pd.DataFrame(columns=["contig_reneo"] + mysamples) + df_rpkm = pd.DataFrame(columns=["contig_reneo"] + mysamples) + df_mean_cov = pd.DataFrame(columns=["contig_reneo"] + mysamples) + + # Get coverage stats of genomes in each sample + read_counts = defaultdict(lambda: defaultdict(list)) + rpkm = defaultdict(lambda: defaultdict(list)) + mean_cov = defaultdict(lambda: defaultdict(list)) + + with open(koverage_tsv, "r") as mf: + for line in mf.readlines()[1:]: + strings = line.strip().split("\t") + read_counts[strings[1]][strings[0]] = int(float(strings[2])) + rpkm[strings[1]][strings[0]] = float(strings[4]) + mean_cov[strings[1]][strings[0]] = float(strings[7]) + + # Add records to dataframe + counter = 0 + for genome in read_counts: + read_counts_row = read_counts[genome] + read_counts_row["contig_reneo"] = genome + read_counts_row = dict(read_counts_row) + read_counts_row_df = pd.DataFrame(read_counts_row, index=[counter]) + df_read_counts = pd.concat([df_read_counts, read_counts_row_df]) + + rpkm_row = rpkm[genome] + rpkm_row["contig_reneo"] = genome + rpkm_row = dict(rpkm_row) + rpkm_row_df = pd.DataFrame(rpkm_row, index=[counter]) + df_rpkm = pd.concat([df_rpkm, rpkm_row_df]) + + mean_cov_row = mean_cov[genome] + mean_cov_row["contig_reneo"] = genome + mean_cov_row = dict(mean_cov_row) + mean_cov_row_df = pd.DataFrame(mean_cov_row, index=[counter]) + df_mean_cov = pd.concat([df_mean_cov, mean_cov_row_df]) + + counter += 1 + + # Save dataframe to file + df_read_counts.to_csv( + f"{output_path}sample_genome_read_counts.tsv", sep="\t", index=False + ) + df_rpkm.to_csv(f"{output_path}sample_genome_rpkm.tsv", sep="\t", index=False) + df_mean_cov.to_csv( + f"{output_path}sample_genome_mean_coverage.tsv", sep="\t", index=False + ) + + logger.info( + f"Raw read counts mapped to resolved genomes can be found in {output_path}sample_genome_read_counts.tsv" + ) + logger.info( + f"RPKM values of resolved genomes can be found in {output_path}sample_genome_rpkm.tsv" + ) + logger.info( + f"Estimated mean read depth of resolved genomes can be found in {output_path}sample_genome_mean_coverage.tsv" + ) + + + # Make sequence information file + with open(info_file, "w") as myfile: + myfile.write(f"contig_reneo_name\tlength\tcontig_or_reneo\n") + for index, record in enumerate(SeqIO.parse(seq_file, "fasta")): + if "phage_comp" in record.id: + myfile.write(f"{record.id}\t{len(record.seq)}\treneo\n") + else: + myfile.write(f"{record.id}\t{len(record.seq)}\tcontig\n") + + logger.info(f"Sequence information file can be found in {info_file}") + + # Exit program + # -------------- + + logger.info("Thank you for using format_koverage_results!") + + +if __name__ == "__main__": + main()