Fix drep add assembler flag (#26)
* feat: add flag for switching off metaspades

* fix: rename bins with SID

* fix: align drep and CAT to using cohort instead of sample

* fix: align gunc with new drep outputs

* fix: a word

* fix: align bbmap rule to new drep outputs

* fix: drep input into bbmap rna to mags rule

* fix: add back dna sid to index input path

* fix: update gunc and gtdbtk rule all outputs

* fix: flip assembly mode value
rroutsong authored Jul 30, 2024
1 parent 97e4de0 commit 17a982b
Showing 4 changed files with 84 additions and 82 deletions.
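The key mechanism in this commit is a single assembler switch threaded from the CLI wrapper into config.json and then decoded in the Snakefile. A minimal sketch of that plumbing in plain Python (the argparse flag and the "0"/"1" encoding follow the diff below; the invocation values are hypothetical, so treat this as an illustration rather than pipeline code):

# Sketch: how --megahit-only becomes an assembler selection (illustrative only)
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--megahit-only', dest='megahit_only', action='store_true', default=False)
sub_args = parser.parse_args(['--megahit-only'])     # hypothetical invocation

config = {"options": {}}
# CLI side (metamorph): store the mode as a string, "0" = megahit + metaspades, "1" = megahit only
config["options"]["assembler_mode"] = "1" if sub_args.megahit_only else "0"

# Snakefile side: decode the string back into a boolean
megahit_only = bool(int(config["options"]["assembler_mode"]))

# DNA.smk side: pick the metawrap assembler flags accordingly
assemblers = "--megahit " if megahit_only else "--megahit --metaspades "
print(assemblers)                                    # prints "--megahit "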
32 changes: 15 additions & 17 deletions metamorph
@@ -155,11 +155,13 @@ def run(sub_args):
sub_args,
config = config
)

config['bindpaths'] = bindpaths
# config['coassembly'] = sub_args.coa
config['coassembly'] = False

# Step 4b. Setup assembly mode
# modes: 0 - megahit + metaspades assembly, 1 - megahit only
config["options"]["assembler_mode"] = "1" if sub_args.megahit_only else "0"

# Step 5. Save config to output directory
with open(os.path.join(sub_args.output, 'config.json'), 'w') as fh:
json.dump(config, fh, indent = 4, sort_keys = True)
@@ -481,21 +483,6 @@ def parsed_arguments(name, description):
help = argparse.SUPPRESS
)

# a supported job scheduler, etc.
# subparser_run.add_argument(
# '-C', '--coa',
# action="store_true",
# required = False,
# help = argparse.SUPPRESS
# )

# subparser_run.add_argument(
# '-R', '--rnacoa',
# action="store_true",
# required = False,
# help = argparse.SUPPRESS
# )

# Name of master job
subparser_run.add_argument(
'--job-name',
@@ -558,6 +545,17 @@ def parsed_arguments(name, description):
# This is only applicable for
# local rules or when running
# in local mode.
subparser_run.add_argument(
'--megahit-only',
dest = 'megahit_only',
action = 'store_true',
required = False,
default = False,
help = argparse.SUPPRESS
)

# Selection of the assemblers used.
# Using --megahit-only restricts assembly to megahit and
# switches off metaspades.
subparser_run.add_argument(
'--threads',
type = int,
13 changes: 7 additions & 6 deletions workflow/Snakefile
@@ -12,6 +12,7 @@ datapath = config["project"]["datapath"]
rna_datapath = config["project"].get("rna_datapath", [])
workpath = config["project"]["workpath"]
tmpdir = config["options"]["tmp_dir"]
megahit_only = bool(int(config["options"]["assembler_mode"]))
coassemble = config['coassembly'] is True
rna_included = list_bool(config.get("rna", 'false'))
rna_coasm = str_bool(config["options"].get("rnacoa", 'False'))
@@ -76,8 +77,8 @@ rule all:
expand(join(top_tax_dir, "{name}", "kronagram.html"), name=samples),
expand(join(top_tax_dir, "{name}", "final_assembly.krak2"), name=samples),
expand(join(top_tax_dir, "{name}", "final_assembly.kraken2"), name=samples),
expand(join(top_tax_dir, "{name}", "GTDBTK_classify_wf"), name=samples),
expand(join(top_tax_dir, "{name}", "GUNC_detect"), name=samples),
expand(join(top_tax_dir, "GTDBTK_classify_wf"), name=samples),
expand(join(top_tax_dir, "GUNC_detect"), name=samples),
# assembly binning
# ~~~~~~~~~~~~~~~
expand(join(top_binning_dir, "{name}", "maxbin2_bins.contigs"), name=samples),
@@ -89,9 +90,9 @@
expand(join(top_binning_dir, "{name}", "figures", "binning_results.png"), name=samples),
# bin refinement
# ~~~~~~~~~~~~~~~
expand(join(top_refine_dir, "{name}", "dRep", "data_tables", "Widb.csv"), name=samples),
expand(join(top_refine_dir, "{name}", "dRep", "figures", "Winning_genomes.pdf"), name=samples),
expand(join(top_refine_dir, "{name}", "dRep", "log", "cluster_arguments.json"), name=samples),
expand(join(top_refine_dir, "dRep", "data_tables", "Widb.csv"), name=samples),
expand(join(top_refine_dir, "dRep", "figures", "Winning_genomes.pdf"), name=samples),
expand(join(top_refine_dir, "dRep", "log", "cluster_arguments.json"), name=samples),
# bin statistics
# ~~~~~~~~~~~~~~~
expand(join(top_refine_dir, "{name}", "RefinedBins_summmary.txt"), name=samples),
@@ -104,7 +105,7 @@
join(top_refine_dir, "cumulative_stats_metawrap.txt"),
# contig annotation
# ~~~~~~~~~~~~~~~
expand(join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.summary.txt"), name=samples),
expand(join(top_refine_dir, "contig_annotation", "out.BAT.bin2classification.summary.txt"), name=samples),
# BBMap mapping to MAGs
# ~~~~~~~~~~~~~~~
expand(join(top_mags_dir, "{name}", "index"), name=samples),
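One subtlety in the rule all edits above: the cohort-level targets (dRep, contig annotation, GTDB-Tk, GUNC) drop the {name} wildcard from their paths but keep the expand(..., name=samples) wrapper, so each of those calls now just yields the same cohort path once per sample and Snakemake resolves the duplicates to a single output. A rough stand-in for what such an expand reduces to (directory and sample names are made up):

from os.path import join

top_refine_dir = "output/metamorph/metawrap_bin_refine"   # hypothetical
samples = ["S1", "S2", "S3"]                              # hypothetical

template = join(top_refine_dir, "dRep", "data_tables", "Widb.csv")
# With no {name} placeholder left in the template, per-sample formatting
# collapses to one cohort-level path repeated len(samples) times.
targets = [template.format(name=s) for s in samples]
assert len(set(targets)) == 1
print(targets[0])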
119 changes: 61 additions & 58 deletions workflow/rules/DNA.smk
@@ -32,6 +32,7 @@ top_mapping_dir = join(workpath, config['project']['id'], "mapping")
metawrap_container = config["containers"]["metawrap"]
pairedness = list(range(1, config['project']['nends']+1))
mem2int = lambda x: int(str(x).lower().replace('gb', '').replace('g', ''))
megahit_only = bool(int(config["options"]["assembler_mode"]))

"""
Step-wise pipeline outline:
@@ -147,7 +148,7 @@ rule metawrap_read_qc:
tmpr1 = lambda _, output, input: join(config['options']['tmp_dir'], 'read_qc', str(basename(str(input.R1))).replace('_R1.', '_1.').replace('.gz', '')),
tmpr2 = lambda _, output, input: join(config['options']['tmp_dir'], 'read_qc', str(basename(str(input.R2))).replace('_R2.', '_2.').replace('.gz', '')),
containerized: metawrap_container,
threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)),
threads: int(cluster["metawrap_read_qc"].get('threads', default_threads)),
shell:
"""
# safe temp directory
@@ -214,14 +215,14 @@
output:
# megahit outputs
megahit_assembly = join(top_assembly_dir, "{name}", "megahit", "final.contigs.fa"),
megahit_longcontigs = join(top_assembly_dir, "{name}", "megahit", "long.contigs.fa"),
# megahit_longcontigs = join(top_assembly_dir, "{name}", "megahit", "long.contigs.fa"),
megahit_log = join(top_assembly_dir, "{name}", "megahit", "log"),
# metaspades outsputs
metaspades_assembly = join(top_assembly_dir, "{name}", "metaspades", "contigs.fasta"),
metaspades_graph = join(top_assembly_dir, "{name}", "metaspades", "assembly_graph.fastg"),
metaspades_longscaffolds = join(top_assembly_dir, "{name}", "metaspades", "long_scaffolds.fasta"),
metaspades_scaffolds = join(top_assembly_dir, "{name}", "metaspades", "scaffolds.fasta"),
metaspades_cor_reads = directory(join(top_assembly_dir, "{name}", "metaspades", "corrected")),
# metaspades outputs
metaspades_assembly = join(top_assembly_dir, "{name}", "metaspades", "contigs.fasta") if not megahit_only else [],
metaspades_graph = join(top_assembly_dir, "{name}", "metaspades", "assembly_graph.fastg") if not megahit_only else [],
metaspades_longscaffolds = join(top_assembly_dir, "{name}", "metaspades", "long_scaffolds.fasta") if not megahit_only else [],
metaspades_scaffolds = join(top_assembly_dir, "{name}", "metaspades", "scaffolds.fasta") if not megahit_only else [],
metaspades_cor_reads = directory(join(top_assembly_dir, "{name}", "metaspades", "corrected")) if not megahit_only else [],
# ensemble outputs
final_assembly = join(top_assembly_dir, "{name}", "final_assembly.fasta"),
final_assembly_report = join(top_assembly_dir, "{name}", "assembly_report.html"),
@@ -234,18 +235,20 @@
mh_dir = join(top_assembly_dir, "{name}", "megahit"),
assembly_dir = join(top_assembly_dir, "{name}"),
contig_min_len = "1000",
megahit_only = megahit_only,
assemblers = "--megahit " if megahit_only else "--megahit --metaspades ",
threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)),
shell:
"""
# remove empty directories by snakemake, to prevent metawrap error
rm -rf {params.mh_dir:q}
if [ -d "{params.assembly_dir}" ]; then rm -rf "{params.assembly_dir}"; fi
mkdir -p "{params.assembly_dir}"
if [ -d {params.mh_dir:q} ]; then rm -rf {params.mh_dir:q}; fi
# link to the file names metawrap expects
ln -s {input.R1} {output.assembly_R1}
ln -s {input.R2} {output.assembly_R2}
# run genome assembler
mw assembly \
--megahit \
--metaspades \
{params.assemblers} \
-m {params.memlimit} \
-t {threads} \
-l {params.contig_min_len} \
@@ -417,6 +420,15 @@ rule metawrap_binning:
-x {params.max_perc_contam}
cp -r {params.tmp_binref_dir}/* {params.bin_dir}
bold=$(tput bold)
normal=$(tput sgr0)
for this_bin in `find {params.bin_dir} -type f -regex ".*/bin.[0-9][0-9]?[0-9]?.fa"`; do
fn=$(basename $this_bin)
to=$(dirname $this_bin)/{wildcards.name}_$fn
echo "relabeling ${{bold}}$fn${{normal}} to ${{bold}}{wildcards.name}_$fn${{normal}}"
mv $this_bin $to
done
touch {output.bin_breadcrumb}
chmod -R 775 {params.bin_dir}
"""
@@ -512,20 +524,21 @@ rule derep_bins:
directory of consensus ensemble bins (deterministic output)
"""
input:
bin_breadcrumb = join(top_binning_dir, "{name}", "{name}_BINNING_COMPLETE"),
bins = expand(join(top_binning_dir, "{name}", "{name}_BINNING_COMPLETE"), name=samples),
output:
derep_genome_info = join(top_refine_dir, "{name}", "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "{name}", "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "{name}", "dRep", "log", "cluster_arguments.json"),
derep_genome_dir = directory(join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes")),
derep_genome_info = join(top_refine_dir, "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "dRep", "log", "cluster_arguments.json"),
derep_genome_dir = directory(join(top_refine_dir, "dRep", "dereplicated_genomes")),
derep_dir = directory(join(top_refine_dir, "dRep")),
singularity: metawrap_container,
threads: int(cluster["derep_bins"].get("threads", default_threads)),
params:
rname = "derep_bins",
sid = "{name}",
tmpdir = config['options']['tmp_dir'],
derep_dir = join(top_refine_dir, "{name}", "dRep"),
metawrap_bins = join(top_binning_dir, "{name}", "metawrap_50_5_bins"),
bindir = join(top_binning_dir),
outto = join(top_refine_dir, "dRep"),
metawrap_dir_name = "metawrap_50_5_bins",
# -l LENGTH: Minimum genome length (default: 50000)
minimum_genome_length = "1000",
# -pa[--P_ani] P_ANI: ANI threshold to form primary (MASH) clusters (default: 0.9)
@@ -551,13 +564,8 @@
dRep check_dependencies
# run drep
export DREP_BINS=$(ls {params.metawrap_bins}/* | tr '\\n' ' ')
printf "\ngenome list\n"
printf ${{DREP_BINS}}
printf "\n"
mkdir -p {params.derep_dir}
DREP_BINS=$(ls {params.bindir}/*/{params.metawrap_dir_name}/*.fa | tr '\\n' ' ')
mkdir -p {params.outto}
dRep dereplicate -d \
-g ${{DREP_BINS}} \
-p {threads} \
@@ -566,25 +574,23 @@
-sa {params.ani_secondary_threshold} \
-nc {params.min_overlap} \
-cm {params.coverage_method} \
{params.derep_dir}
{params.outto}
"""


rule contig_annotation:
input:
derep_genome_info = join(top_refine_dir, "{name}", "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "{name}", "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "{name}", "dRep", "log", "cluster_arguments.json"),
dRep_dir = join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes"),
derep_genome_info = join(top_refine_dir, "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "dRep", "log", "cluster_arguments.json"),
dRep_dir = join(top_refine_dir, "dRep", "dereplicated_genomes"),
output:
cat_bin2cls_filename = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.txt"),
cat_bin2cls_official = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.official_names.txt"),
cat_bin2cls_summary = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.summary.txt"),
cat_bin2cls_filename = join(top_refine_dir, "contig_annotation", "out.BAT.bin2classification.txt"),
cat_bin2cls_official = join(top_refine_dir, "contig_annotation", "out.BAT.bin2classification.official_names.txt"),
cat_bin2cls_summary = join(top_refine_dir, "contig_annotation", "out.BAT.bin2classification.summary.txt"),
params:
cat_dir = join(top_refine_dir, "{name}", "contig_annotation"),
dRep_dir = join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes"),
cat_dir = join(top_refine_dir, "contig_annotation"),
rname = "contig_annotation",
sid = "{name}",
cat_db = "/data2/CAT", # from <root>/config/resources.json
tax_db = "/data2/CAT_tax", # from <root>/config/resourcs.json
diamond_exec = "/usr/bin/diamond",
@@ -594,7 +600,7 @@ rule contig_annotation:
"""
if [ ! -d "{params.cat_dir}" ]; then mkdir -p "{params.cat_dir}"; fi
cd {params.cat_dir} && CAT bins \
-b {params.dRep_dir} \
-b {input.dRep_dir} \
-d {params.cat_db} \
-t {params.tax_db} \
--path_to_diamond {params.diamond_exec} \
@@ -615,13 +621,13 @@

rule bbtools_index_map:
input:
derep_genome_info = join(top_refine_dir, "{name}", "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "{name}", "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "{name}", "dRep", "log", "cluster_arguments.json"),
dRep_dir = join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes"),
cat_bin2cls_filename = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.txt"),
cat_bing2cls_official = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.official_names.txt"),
cat_bing2cls_summary = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.summary.txt"),
derep_genome_info = join(top_refine_dir, "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "dRep", "log", "cluster_arguments.json"),
dRep_dir = join(top_refine_dir, "dRep", "dereplicated_genomes"),
cat_bin2cls_filename = join(top_refine_dir, "contig_annotation", "out.BAT.bin2classification.txt"),
cat_bing2cls_official = join(top_refine_dir, "contig_annotation", "out.BAT.bin2classification.official_names.txt"),
cat_bing2cls_summary = join(top_refine_dir, "contig_annotation", "out.BAT.bin2classification.summary.txt"),
R1 = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"),
R2 = join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq"),
output:
@@ -705,15 +711,13 @@ rule bbtools_index_map:

rule gtdbtk_classify:
input:
R1 = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"),
R2 = join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq"),
dRep_dir = join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes"),
dRep_dir = join(top_refine_dir, "dRep", "dereplicated_genomes"),
output:
gtdbk_dir = directory(join(top_tax_dir, "{name}", "GTDBTK_classify_wf"))
threads: int(cluster["gtdbk_classify"].get("threads", default_threads)),
gtdbk_dir = directory(join(top_tax_dir, "GTDBTK_classify_wf"))
threads:
int(cluster["gtdbk_classify"].get("threads", default_threads)),
params:
rname = "gtdbk_classify",
sid = "{name}",
tmp_safe_dir = join(config['options']['tmp_dir'], 'gtdbtk_classify'),
GTDBTK_DB = "/data2/GTDBTK_DB",
singularity: metawrap_container,
@@ -743,15 +747,14 @@

rule gunc_detection:
input:
derep_genome_info = join(top_refine_dir, "{name}", "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "{name}", "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "{name}", "dRep", "log", "cluster_arguments.json"),
drep_genomes = join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes"),
derep_genome_info = join(top_refine_dir, "dRep", "data_tables", "Widb.csv"),
derep_winning_figure = join(top_refine_dir, "dRep", "figures", "Winning_genomes.pdf"),
derep_args = join(top_refine_dir, "dRep", "log", "cluster_arguments.json"),
drep_genomes = join(top_refine_dir, "dRep", "dereplicated_genomes"),
output:
GUNC_detect_out = directory(join(top_tax_dir, "{name}", "GUNC_detect"))
GUNC_detect_out = directory(join(top_tax_dir,"GUNC_detect"))
params:
rname = "gunc_detection",
sid = "{name}",
gunc_db = "/data2/GUNC_DB/gunc_db_progenomes2.1.dmnd", # from <root>/config/resourcs.json
tmp_safe_dir = join(config['options']['tmp_dir'], 'gunc_detect'),
singularity: metawrap_container,
2 changes: 1 addition & 1 deletion workflow/scripts/common.py
@@ -17,7 +17,7 @@ def get_paired_dna(config, rna_sid):
top_refine_dir = join(config["project"]["workpath"], config['project']['id'], "metawrap_bin_refine")
top_mags_dir = join(config["project"]["workpath"], config['project']['id'], "mags")
# construct necessary linked dna artifacts
derep_reqs = join(top_refine_dir, dna_sid, "dRep", "dereplicated_genomes")
derep_reqs = join(top_refine_dir, "dRep", "dereplicated_genomes")
mag_index = join(top_mags_dir, dna_sid, "index")
return [derep_reqs, mag_index]
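With the per-sample dRep directory gone, get_paired_dna now pairs an RNA sample with the cohort-level dereplicated genomes plus that sample's own MAG index. A small illustration of the two paths it returns, with hypothetical workpath, project id, and sample values:

from os.path import join

workpath, project_id = "/data/run1", "metamorph_run"   # hypothetical
dna_sid = "SampleA"                                    # hypothetical paired DNA sample

top_refine_dir = join(workpath, project_id, "metawrap_bin_refine")
top_mags_dir = join(workpath, project_id, "mags")

derep_reqs = join(top_refine_dir, "dRep", "dereplicated_genomes")   # cohort-level, no sample ID
mag_index = join(top_mags_dir, dna_sid, "index")                    # still per-sample
print([derep_reqs, mag_index])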
