DEV: add postprocessing steps

Vini2 · Aug 15, 2023 · c122a6c · c122a6c
1 parent 4b684ea
commit c122a6c
Show file tree

Hide file tree

Showing 6 changed files with 242 additions and 2 deletions.
diff --git a/reneo/__main__.py b/reneo/__main__.py
@@ -110,6 +110,7 @@ def cli():
     all             Run everything (default)
     preprocess      Run preprocessing only
     reneo           Run reneo (and preprocessing if needed)
+    postprocess     Run postprocessing (with preprocessing and reneo if needed)
     print_targets   List available targets
 """
 

diff --git a/reneo/workflow/reneo.smk b/reneo/workflow/reneo.smk
@@ -28,15 +28,16 @@ def targetRule(fn):
     target_rules.append(fn.__name__[2:])
     return fn
 
-localrules: all, preprocess, reneo, koverage_tsv, print_stages
+localrules: all, preprocess, reneo, koverage_tsv, postprocess, print_stages
 
 
 """Run stages"""
 @targetRule
 rule all:
     input:
         preprocessTargets,
-        reneoTargets
+        reneoTargets,
+        postprocessTargets
 
 
 @targetRule
@@ -51,6 +52,12 @@ rule reneo:
         reneoTargets
 
 
+@targetRule
+rule postprocess:
+    input:
+        postprocessTargets
+
+
 @targetRule
 rule print_stages:
     run:
@@ -73,3 +80,7 @@ include: os.path.join("rules", "genes.smk")
 
 # Step 5: Run Reneo
 include: os.path.join("rules", "reneo.smk")
+
+
+# Step 6: Postprocess genomes
+include: os.path.join("rules", "postprocess.smk")
diff --git a/reneo/workflow/rules/02_reneo_targets.smk b/reneo/workflow/rules/02_reneo_targets.smk
@@ -1,6 +1,7 @@
 
 preprocessTargets = []
 reneoTargets = []
+postprocessTargets = []
 
 
 """PREPROCESSING TARGETS"""
@@ -37,3 +38,10 @@ reneoTargets.append(RESOLVED_COMP_INFO)
 COMP_VOGS = os.path.join(OUTDIR, "component_vogs.txt")
 reneoTargets.append(COMP_VOGS)
 
+
+"""POSTPROCESSING TARGETS"""
+GENOME_KOVERAGE_RES = os.path.join(OUTDIR, "results", "sample_coverage.tsv")
+postprocessTargets.append(GENOME_KOVERAGE_RES)
+
+GENOME_READ_COUNTS = os.path.join(OUTDIR, "sample_genome_read_counts.tsv")
+postprocessTargets.append(GENOME_READ_COUNTS)
diff --git a/reneo/workflow/rules/postprocess.smk b/reneo/workflow/rules/postprocess.smk
@@ -0,0 +1,62 @@
+rule combine_genomes_and_unresolved_edges:
+    """Combine resolved genomes and unresolved edges"""
+    input:
+        genomes = RESOLVED_GENOMES,
+        unresolved_edges = os.path.join(OUTDIR, "unresolved_virus_like_edges.fasta")
+    output:
+        os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta")
+    shell:
+        """
+        cat {input.genomes} {input.unresolved_edges} > {output}
+        """
+
+
+rule koverage_genomes:
+    """Get coverage statistics with Koverage"""
+    input:
+        tsv = os.path.join(OUTDIR,"reneo.samples.tsv"),
+        sequences = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta")
+    params:
+        out_dir = OUTDIR,
+        profile = lambda wildcards: "--profile " + config["profile"] if config["profile"] else "",
+    output:
+        os.path.join(OUTDIR, "results", "sample_coverage.tsv")
+    threads:
+        config["resources"]["jobCPU"]
+    resources:
+        mem_mb = config["resources"]["jobMem"],
+        mem = str(config["resources"]["jobMem"]) + "MB"
+    conda:
+        os.path.join("..", "envs", "koverage.yaml")
+    shell:
+        """
+        koverage run \
+            --reads {input.tsv} \
+            --ref {input.sequences} \
+            --threads {threads} \
+            --output {params.out_dir} \
+            {params.profile}
+        """
+
+
+rule koverage_postprocess:
+    """Format TSV of samples and reads from Koverage"""
+    input:
+        koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"),
+        samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"),
+        seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta")
+    output:
+        os.path.join(OUTDIR, "sample_genome_read_counts.tsv")
+    params:
+        koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"),
+        samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"),
+        seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta"),
+        info_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges_info.tsv"),
+        output_path = OUTDIR,
+        log = os.path.join(LOGSDIR, "format_koverage_results_output.log")
+    log:
+        os.path.join(LOGSDIR, "format_koverage_results_output.log")
+    conda:
+        os.path.join("..", "envs", "reneo.yaml")
+    script:
+        os.path.join("..", "scripts", "format_koverage_results.py")
diff --git a/reneo/workflow/rules/reneo.smk b/reneo/workflow/rules/reneo.smk
@@ -12,6 +12,7 @@ rule run_reneo:
         unitigs = os.path.join(OUTDIR, "resolved_edges.fasta"),
         component_info = os.path.join(OUTDIR, "resolved_component_info.txt"),
         vog_comp_info = os.path.join(OUTDIR, "component_vogs.txt"),
+        unresolved_edges = os.path.join(OUTDIR, "unresolved_virus_like_edges.fasta"),
     params:
         graph = GRAPH_FILE,
         hmmout = SMG_FILE,

diff --git a/reneo/workflow/scripts/format_koverage_results.py b/reneo/workflow/scripts/format_koverage_results.py
@@ -0,0 +1,157 @@
+#!/usr/bin/python3
+
+"""format_koverage_results.py: Format koverage results.
+
+"""
+
+import logging
+import os
+import subprocess
+from Bio import SeqIO
+from collections import defaultdict
+
+import pandas as pd
+
+__author__ = "Vijini Mallawaarachchi"
+__copyright__ = "Copyright 2023, Reneo Project"
+__license__ = "MIT"
+__type__ = "Support Script"
+__maintainer__ = "Vijini Mallawaarachchi"
+__email__ = "[email protected]"
+
+
+def main():
+    # Get arguments
+    # -----------------------
+
+    samples_file = snakemake.params.samples_file
+    koverage_tsv = snakemake.params.koverage_tsv
+    seq_file = snakemake.params.seq_file
+    info_file = snakemake.params.info_file
+    output_path = snakemake.params.output_path
+    log = snakemake.params.log
+
+    # Setup logger
+    # ----------------------------------------------------------------------
+
+    logger = logging.getLogger("format_coverage")
+    logger.setLevel(logging.DEBUG)
+    logging.captureWarnings(True)
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    consoleHeader = logging.StreamHandler()
+    consoleHeader.setFormatter(formatter)
+    consoleHeader.setLevel(logging.INFO)
+    logger.addHandler(consoleHeader)
+
+    # Setup output path for log file
+    if log is None:
+        fileHandler = logging.FileHandler(f"{log}")
+    else:
+        fileHandler = logging.FileHandler(f"{log}")
+
+    fileHandler.setLevel(logging.DEBUG)
+    fileHandler.setFormatter(formatter)
+    logger.addHandler(fileHandler)
+
+    # Validate inputs
+    # ---------------------------------------------------
+
+    # Handle for missing trailing forwardslash in output folder path
+    if output_path[-1:] != "/":
+        output_path = output_path + "/"
+
+    # Create output folder if it does not exist
+    if not os.path.isdir(output_path):
+        subprocess.run("mkdir -p " + output_path, shell=True)
+
+    # Get sample-wise genome coverage stats
+    # ----------------------------------------------------------------------
+
+    # Log inputs
+    logger.info(f"Samples file: {samples_file}")
+    logger.info(f"Koverage results: {koverage_tsv}")
+    logger.info(f"Output path: {output_path}")
+
+    # Get sample names
+    mysamples = [s.split("\t")[0] for s in open(samples_file, "r")]
+    logger.debug(mysamples)
+
+    # Initialise dataframe
+    df_read_counts = pd.DataFrame(columns=["contig_reneo"] + mysamples)
+    df_rpkm = pd.DataFrame(columns=["contig_reneo"] + mysamples)
+    df_mean_cov = pd.DataFrame(columns=["contig_reneo"] + mysamples)
+
+    # Get coverage stats of genomes in each sample
+    read_counts = defaultdict(lambda: defaultdict(list))
+    rpkm = defaultdict(lambda: defaultdict(list))
+    mean_cov = defaultdict(lambda: defaultdict(list))
+
+    with open(koverage_tsv, "r") as mf:
+        for line in mf.readlines()[1:]:
+            strings = line.strip().split("\t")
+            read_counts[strings[1]][strings[0]] = int(float(strings[2]))
+            rpkm[strings[1]][strings[0]] = float(strings[4])
+            mean_cov[strings[1]][strings[0]] = float(strings[7])
+
+    # Add records to dataframe
+    counter = 0
+    for genome in read_counts:
+        read_counts_row = read_counts[genome]
+        read_counts_row["contig_reneo"] = genome
+        read_counts_row = dict(read_counts_row)
+        read_counts_row_df = pd.DataFrame(read_counts_row, index=[counter])
+        df_read_counts = pd.concat([df_read_counts, read_counts_row_df])
+
+        rpkm_row = rpkm[genome]
+        rpkm_row["contig_reneo"] = genome
+        rpkm_row = dict(rpkm_row)
+        rpkm_row_df = pd.DataFrame(rpkm_row, index=[counter])
+        df_rpkm = pd.concat([df_rpkm, rpkm_row_df])
+
+        mean_cov_row = mean_cov[genome]
+        mean_cov_row["contig_reneo"] = genome
+        mean_cov_row = dict(mean_cov_row)
+        mean_cov_row_df = pd.DataFrame(mean_cov_row, index=[counter])
+        df_mean_cov = pd.concat([df_mean_cov, mean_cov_row_df])
+
+        counter += 1
+
+    # Save dataframe to file
+    df_read_counts.to_csv(
+        f"{output_path}sample_genome_read_counts.tsv", sep="\t", index=False
+    )
+    df_rpkm.to_csv(f"{output_path}sample_genome_rpkm.tsv", sep="\t", index=False)
+    df_mean_cov.to_csv(
+        f"{output_path}sample_genome_mean_coverage.tsv", sep="\t", index=False
+    )
+
+    logger.info(
+        f"Raw read counts mapped to resolved genomes can be found in {output_path}sample_genome_read_counts.tsv"
+    )
+    logger.info(
+        f"RPKM values of resolved genomes can be found in {output_path}sample_genome_rpkm.tsv"
+    )
+    logger.info(
+        f"Estimated mean read depth of resolved genomes can be found in {output_path}sample_genome_mean_coverage.tsv"
+    )
+
+
+    # Make sequence information file
+    with open(info_file, "w") as myfile:
+        myfile.write(f"contig_reneo_name\tlength\tcontig_or_reneo\n")
+        for index, record in enumerate(SeqIO.parse(seq_file, "fasta")):
+            if "phage_comp" in record.id:
+                myfile.write(f"{record.id}\t{len(record.seq)}\treneo\n")
+            else:
+                myfile.write(f"{record.id}\t{len(record.seq)}\tcontig\n")
+
+    logger.info(f"Sequence information file can be found in {info_file}")
+
+    # Exit program
+    # --------------
+
+    logger.info("Thank you for using format_koverage_results!")
+
+
+if __name__ == "__main__":
+    main()