-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
242 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
rule combine_genomes_and_unresolved_edges: | ||
"""Combine resolved genomes and unresolved edges""" | ||
input: | ||
genomes = RESOLVED_GENOMES, | ||
unresolved_edges = os.path.join(OUTDIR, "unresolved_virus_like_edges.fasta") | ||
output: | ||
os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta") | ||
shell: | ||
""" | ||
cat {input.genomes} {input.unresolved_edges} > {output} | ||
""" | ||
|
||
|
||
rule koverage_genomes: | ||
"""Get coverage statistics with Koverage""" | ||
input: | ||
tsv = os.path.join(OUTDIR,"reneo.samples.tsv"), | ||
sequences = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta") | ||
params: | ||
out_dir = OUTDIR, | ||
profile = lambda wildcards: "--profile " + config["profile"] if config["profile"] else "", | ||
output: | ||
os.path.join(OUTDIR, "results", "sample_coverage.tsv") | ||
threads: | ||
config["resources"]["jobCPU"] | ||
resources: | ||
mem_mb = config["resources"]["jobMem"], | ||
mem = str(config["resources"]["jobMem"]) + "MB" | ||
conda: | ||
os.path.join("..", "envs", "koverage.yaml") | ||
shell: | ||
""" | ||
koverage run \ | ||
--reads {input.tsv} \ | ||
--ref {input.sequences} \ | ||
--threads {threads} \ | ||
--output {params.out_dir} \ | ||
{params.profile} | ||
""" | ||
|
||
|
||
rule koverage_postprocess: | ||
"""Format TSV of samples and reads from Koverage""" | ||
input: | ||
koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"), | ||
samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"), | ||
seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta") | ||
output: | ||
os.path.join(OUTDIR, "sample_genome_read_counts.tsv") | ||
params: | ||
koverage_tsv = os.path.join(OUTDIR, "results", "sample_coverage.tsv"), | ||
samples_file = os.path.join(OUTDIR, "reneo.samples.tsv"), | ||
seq_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges.fasta"), | ||
info_file = os.path.join(OUTDIR, "genomes_and_unresolved_edges_info.tsv"), | ||
output_path = OUTDIR, | ||
log = os.path.join(LOGSDIR, "format_koverage_results_output.log") | ||
log: | ||
os.path.join(LOGSDIR, "format_koverage_results_output.log") | ||
conda: | ||
os.path.join("..", "envs", "reneo.yaml") | ||
script: | ||
os.path.join("..", "scripts", "format_koverage_results.py") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
#!/usr/bin/python3 | ||
|
||
"""format_koverage_results.py: Format koverage results. | ||
""" | ||
|
||
import logging | ||
import os | ||
import subprocess | ||
from Bio import SeqIO | ||
from collections import defaultdict | ||
|
||
import pandas as pd | ||
|
||
__author__ = "Vijini Mallawaarachchi" | ||
__copyright__ = "Copyright 2023, Reneo Project" | ||
__license__ = "MIT" | ||
__type__ = "Support Script" | ||
__maintainer__ = "Vijini Mallawaarachchi" | ||
__email__ = "[email protected]" | ||
|
||
|
||
def main(): | ||
# Get arguments | ||
# ----------------------- | ||
|
||
samples_file = snakemake.params.samples_file | ||
koverage_tsv = snakemake.params.koverage_tsv | ||
seq_file = snakemake.params.seq_file | ||
info_file = snakemake.params.info_file | ||
output_path = snakemake.params.output_path | ||
log = snakemake.params.log | ||
|
||
# Setup logger | ||
# ---------------------------------------------------------------------- | ||
|
||
logger = logging.getLogger("format_coverage") | ||
logger.setLevel(logging.DEBUG) | ||
logging.captureWarnings(True) | ||
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") | ||
consoleHeader = logging.StreamHandler() | ||
consoleHeader.setFormatter(formatter) | ||
consoleHeader.setLevel(logging.INFO) | ||
logger.addHandler(consoleHeader) | ||
|
||
# Setup output path for log file | ||
if log is None: | ||
fileHandler = logging.FileHandler(f"{log}") | ||
else: | ||
fileHandler = logging.FileHandler(f"{log}") | ||
|
||
fileHandler.setLevel(logging.DEBUG) | ||
fileHandler.setFormatter(formatter) | ||
logger.addHandler(fileHandler) | ||
|
||
# Validate inputs | ||
# --------------------------------------------------- | ||
|
||
# Handle for missing trailing forwardslash in output folder path | ||
if output_path[-1:] != "/": | ||
output_path = output_path + "/" | ||
|
||
# Create output folder if it does not exist | ||
if not os.path.isdir(output_path): | ||
subprocess.run("mkdir -p " + output_path, shell=True) | ||
|
||
# Get sample-wise genome coverage stats | ||
# ---------------------------------------------------------------------- | ||
|
||
# Log inputs | ||
logger.info(f"Samples file: {samples_file}") | ||
logger.info(f"Koverage results: {koverage_tsv}") | ||
logger.info(f"Output path: {output_path}") | ||
|
||
# Get sample names | ||
mysamples = [s.split("\t")[0] for s in open(samples_file, "r")] | ||
logger.debug(mysamples) | ||
|
||
# Initialise dataframe | ||
df_read_counts = pd.DataFrame(columns=["contig_reneo"] + mysamples) | ||
df_rpkm = pd.DataFrame(columns=["contig_reneo"] + mysamples) | ||
df_mean_cov = pd.DataFrame(columns=["contig_reneo"] + mysamples) | ||
|
||
# Get coverage stats of genomes in each sample | ||
read_counts = defaultdict(lambda: defaultdict(list)) | ||
rpkm = defaultdict(lambda: defaultdict(list)) | ||
mean_cov = defaultdict(lambda: defaultdict(list)) | ||
|
||
with open(koverage_tsv, "r") as mf: | ||
for line in mf.readlines()[1:]: | ||
strings = line.strip().split("\t") | ||
read_counts[strings[1]][strings[0]] = int(float(strings[2])) | ||
rpkm[strings[1]][strings[0]] = float(strings[4]) | ||
mean_cov[strings[1]][strings[0]] = float(strings[7]) | ||
|
||
# Add records to dataframe | ||
counter = 0 | ||
for genome in read_counts: | ||
read_counts_row = read_counts[genome] | ||
read_counts_row["contig_reneo"] = genome | ||
read_counts_row = dict(read_counts_row) | ||
read_counts_row_df = pd.DataFrame(read_counts_row, index=[counter]) | ||
df_read_counts = pd.concat([df_read_counts, read_counts_row_df]) | ||
|
||
rpkm_row = rpkm[genome] | ||
rpkm_row["contig_reneo"] = genome | ||
rpkm_row = dict(rpkm_row) | ||
rpkm_row_df = pd.DataFrame(rpkm_row, index=[counter]) | ||
df_rpkm = pd.concat([df_rpkm, rpkm_row_df]) | ||
|
||
mean_cov_row = mean_cov[genome] | ||
mean_cov_row["contig_reneo"] = genome | ||
mean_cov_row = dict(mean_cov_row) | ||
mean_cov_row_df = pd.DataFrame(mean_cov_row, index=[counter]) | ||
df_mean_cov = pd.concat([df_mean_cov, mean_cov_row_df]) | ||
|
||
counter += 1 | ||
|
||
# Save dataframe to file | ||
df_read_counts.to_csv( | ||
f"{output_path}sample_genome_read_counts.tsv", sep="\t", index=False | ||
) | ||
df_rpkm.to_csv(f"{output_path}sample_genome_rpkm.tsv", sep="\t", index=False) | ||
df_mean_cov.to_csv( | ||
f"{output_path}sample_genome_mean_coverage.tsv", sep="\t", index=False | ||
) | ||
|
||
logger.info( | ||
f"Raw read counts mapped to resolved genomes can be found in {output_path}sample_genome_read_counts.tsv" | ||
) | ||
logger.info( | ||
f"RPKM values of resolved genomes can be found in {output_path}sample_genome_rpkm.tsv" | ||
) | ||
logger.info( | ||
f"Estimated mean read depth of resolved genomes can be found in {output_path}sample_genome_mean_coverage.tsv" | ||
) | ||
|
||
|
||
# Make sequence information file | ||
with open(info_file, "w") as myfile: | ||
myfile.write(f"contig_reneo_name\tlength\tcontig_or_reneo\n") | ||
for index, record in enumerate(SeqIO.parse(seq_file, "fasta")): | ||
if "phage_comp" in record.id: | ||
myfile.write(f"{record.id}\t{len(record.seq)}\treneo\n") | ||
else: | ||
myfile.write(f"{record.id}\t{len(record.seq)}\tcontig\n") | ||
|
||
logger.info(f"Sequence information file can be found in {info_file}") | ||
|
||
# Exit program | ||
# -------------- | ||
|
||
logger.info("Thank you for using format_koverage_results!") | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |