Skip to content

Commit

Permalink
DEV: add postprocessing steps
Browse files Browse the repository at this point in the history
  • Loading branch information
Vini2 committed Aug 15, 2023
1 parent 4b684ea commit c122a6c
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 2 deletions.
1 change: 1 addition & 0 deletions reneo/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def cli():
all Run everything (default)
preprocess Run preprocessing only
reneo Run reneo (and preprocessing if needed)
postprocess Run postprocessing (with preprocessing and reneo if needed)
print_targets List available targets
"""

Expand Down
15 changes: 13 additions & 2 deletions reneo/workflow/reneo.smk
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,16 @@ def targetRule(fn):
target_rules.append(fn.__name__[2:])
return fn

localrules: all, preprocess, reneo, koverage_tsv, print_stages
localrules: all, preprocess, reneo, koverage_tsv, postprocess, print_stages


"""Run stages"""
@targetRule
rule all:
input:
preprocessTargets,
reneoTargets
reneoTargets,
postprocessTargets


@targetRule
Expand All @@ -51,6 +52,12 @@ rule reneo:
reneoTargets


@targetRule
rule postprocess:
input:
postprocessTargets


@targetRule
rule print_stages:
run:
Expand All @@ -73,3 +80,7 @@ include: os.path.join("rules", "genes.smk")

# Step 5: Run Reneo
include: os.path.join("rules", "reneo.smk")


# Step 6: Postprocess genomes
include: os.path.join("rules", "postprocess.smk")
8 changes: 8 additions & 0 deletions reneo/workflow/rules/02_reneo_targets.smk
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

preprocessTargets = []
reneoTargets = []
postprocessTargets = []


"""PREPROCESSING TARGETS"""
Expand Down Expand Up @@ -37,3 +38,10 @@ reneoTargets.append(RESOLVED_COMP_INFO)
COMP_VOGS = os.path.join(OUTDIR, "component_vogs.txt")
reneoTargets.append(COMP_VOGS)


"""POSTPROCESSING TARGETS"""
GENOME_KOVERAGE_RES = os.path.join(OUTDIR, "results", "sample_coverage.tsv")
postprocessTargets.append(GENOME_KOVERAGE_RES)

GENOME_READ_COUNTS = os.path.join(OUTDIR, "sample_genome_read_counts.tsv")
postprocessTargets.append(GENOME_READ_COUNTS)
62 changes: 62 additions & 0 deletions reneo/workflow/rules/postprocess.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
rule combine_genomes_and_unresolved_edges:
"""Combine resolved genomes and unresolved edges"""
input:
genomes = RESOLVED_GENOMES,
unresolved_edges = os.path.join(OUTDIR, "unresolved_virus_like_edges.fasta")
output:
os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta")
shell:
"""
cat {input.genomes} {input.unresolved_edges} > {output}
"""


rule koverage_genomes:
"""Get coverage statistics with Koverage"""
input:
tsv = os.path.join(OUTDIR,"reneo.samples.tsv"),
sequences = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta")
params:
out_dir = OUTDIR,
profile = lambda wildcards: "--profile " + config["profile"] if config["profile"] else "",
output:
os.path.join(OUTDIR, "results", "sample_coverage.tsv")
threads:
config["resources"]["jobCPU"]
resources:
mem_mb = config["resources"]["jobMem"],
mem = str(config["resources"]["jobMem"]) + "MB"
conda:
os.path.join("..", "envs", "koverage.yaml")
shell:
"""
koverage run \
--reads {input.tsv} \
--ref {input.sequences} \
--threads {threads} \
--output {params.out_dir} \
{params.profile}
"""


rule koverage_postprocess:
"""Format TSV of samples and reads from Koverage"""
input:
koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"),
samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"),
seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta")
output:
os.path.join(OUTDIR, "sample_genome_read_counts.tsv")
params:
koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"),
samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"),
seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta"),
info_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges_info.tsv"),
output_path = OUTDIR,
log = os.path.join(LOGSDIR, "format_koverage_results_output.log")
log:
os.path.join(LOGSDIR, "format_koverage_results_output.log")
conda:
os.path.join("..", "envs", "reneo.yaml")
script:
os.path.join("..", "scripts", "format_koverage_results.py")
1 change: 1 addition & 0 deletions reneo/workflow/rules/reneo.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ rule run_reneo:
unitigs = os.path.join(OUTDIR, "resolved_edges.fasta"),
component_info = os.path.join(OUTDIR, "resolved_component_info.txt"),
vog_comp_info = os.path.join(OUTDIR, "component_vogs.txt"),
unresolved_edges = os.path.join(OUTDIR, "unresolved_virus_like_edges.fasta"),
params:
graph = GRAPH_FILE,
hmmout = SMG_FILE,
Expand Down
157 changes: 157 additions & 0 deletions reneo/workflow/scripts/format_koverage_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/usr/bin/python3

"""format_koverage_results.py: Format koverage results.
"""

import logging
import os
import subprocess
from Bio import SeqIO
from collections import defaultdict

import pandas as pd

__author__ = "Vijini Mallawaarachchi"
__copyright__ = "Copyright 2023, Reneo Project"
__license__ = "MIT"
__type__ = "Support Script"
__maintainer__ = "Vijini Mallawaarachchi"
__email__ = "[email protected]"


def main():
# Get arguments
# -----------------------

samples_file = snakemake.params.samples_file
koverage_tsv = snakemake.params.koverage_tsv
seq_file = snakemake.params.seq_file
info_file = snakemake.params.info_file
output_path = snakemake.params.output_path
log = snakemake.params.log

# Setup logger
# ----------------------------------------------------------------------

logger = logging.getLogger("format_coverage")
logger.setLevel(logging.DEBUG)
logging.captureWarnings(True)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
consoleHeader = logging.StreamHandler()
consoleHeader.setFormatter(formatter)
consoleHeader.setLevel(logging.INFO)
logger.addHandler(consoleHeader)

# Setup output path for log file
if log is None:
fileHandler = logging.FileHandler(f"{log}")
else:
fileHandler = logging.FileHandler(f"{log}")

fileHandler.setLevel(logging.DEBUG)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)

# Validate inputs
# ---------------------------------------------------

# Handle for missing trailing forwardslash in output folder path
if output_path[-1:] != "/":
output_path = output_path + "/"

# Create output folder if it does not exist
if not os.path.isdir(output_path):
subprocess.run("mkdir -p " + output_path, shell=True)

# Get sample-wise genome coverage stats
# ----------------------------------------------------------------------

# Log inputs
logger.info(f"Samples file: {samples_file}")
logger.info(f"Koverage results: {koverage_tsv}")
logger.info(f"Output path: {output_path}")

# Get sample names
mysamples = [s.split("\t")[0] for s in open(samples_file, "r")]
logger.debug(mysamples)

# Initialise dataframe
df_read_counts = pd.DataFrame(columns=["contig_reneo"] + mysamples)
df_rpkm = pd.DataFrame(columns=["contig_reneo"] + mysamples)
df_mean_cov = pd.DataFrame(columns=["contig_reneo"] + mysamples)

# Get coverage stats of genomes in each sample
read_counts = defaultdict(lambda: defaultdict(list))
rpkm = defaultdict(lambda: defaultdict(list))
mean_cov = defaultdict(lambda: defaultdict(list))

with open(koverage_tsv, "r") as mf:
for line in mf.readlines()[1:]:
strings = line.strip().split("\t")
read_counts[strings[1]][strings[0]] = int(float(strings[2]))
rpkm[strings[1]][strings[0]] = float(strings[4])
mean_cov[strings[1]][strings[0]] = float(strings[7])

# Add records to dataframe
counter = 0
for genome in read_counts:
read_counts_row = read_counts[genome]
read_counts_row["contig_reneo"] = genome
read_counts_row = dict(read_counts_row)
read_counts_row_df = pd.DataFrame(read_counts_row, index=[counter])
df_read_counts = pd.concat([df_read_counts, read_counts_row_df])

rpkm_row = rpkm[genome]
rpkm_row["contig_reneo"] = genome
rpkm_row = dict(rpkm_row)
rpkm_row_df = pd.DataFrame(rpkm_row, index=[counter])
df_rpkm = pd.concat([df_rpkm, rpkm_row_df])

mean_cov_row = mean_cov[genome]
mean_cov_row["contig_reneo"] = genome
mean_cov_row = dict(mean_cov_row)
mean_cov_row_df = pd.DataFrame(mean_cov_row, index=[counter])
df_mean_cov = pd.concat([df_mean_cov, mean_cov_row_df])

counter += 1

# Save dataframe to file
df_read_counts.to_csv(
f"{output_path}sample_genome_read_counts.tsv", sep="\t", index=False
)
df_rpkm.to_csv(f"{output_path}sample_genome_rpkm.tsv", sep="\t", index=False)
df_mean_cov.to_csv(
f"{output_path}sample_genome_mean_coverage.tsv", sep="\t", index=False
)

logger.info(
f"Raw read counts mapped to resolved genomes can be found in {output_path}sample_genome_read_counts.tsv"
)
logger.info(
f"RPKM values of resolved genomes can be found in {output_path}sample_genome_rpkm.tsv"
)
logger.info(
f"Estimated mean read depth of resolved genomes can be found in {output_path}sample_genome_mean_coverage.tsv"
)


# Make sequence information file
with open(info_file, "w") as myfile:
myfile.write(f"contig_reneo_name\tlength\tcontig_or_reneo\n")
for index, record in enumerate(SeqIO.parse(seq_file, "fasta")):
if "phage_comp" in record.id:
myfile.write(f"{record.id}\t{len(record.seq)}\treneo\n")
else:
myfile.write(f"{record.id}\t{len(record.seq)}\tcontig\n")

logger.info(f"Sequence information file can be found in {info_file}")

# Exit program
# --------------

logger.info("Thank you for using format_koverage_results!")


if __name__ == "__main__":
main()

0 comments on commit c122a6c

Please sign in to comment.