diff --git a/CITATION.cff b/CITATION.cff
index acd533fb..30a3c101 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -21,4 +21,5 @@ authors:
     orcid: https://orcid.org/0000-0003-0670-7480
 title: "Aviary: Hybrid assembly and genome recovery from metagenomes with Aviary"
 version: 0.8.3
+doi: 10.5281/zenodo.10158087
 date-released: 2023-11-20
\ No newline at end of file
diff --git a/README.md b/README.md
index c8391c44..7f7c8daf 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,11 @@
+[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/aviary/README.html)
+![](https://anaconda.org/bioconda/aviary/badges/license.svg)
+![](https://anaconda.org/bioconda/aviary/badges/version.svg)
+![](https://anaconda.org/bioconda/aviary/badges/latest_release_relative_date.svg)
+![](https://anaconda.org/bioconda/aviary/badges/platforms.svg)
+[![DOI](https://zenodo.org/badge/271448699.svg)](https://zenodo.org/doi/10.5281/zenodo.10158086)
+
+
 ![](docs/_include/images/aviary_logo.png)
 
 # Aviary
@@ -110,7 +118,7 @@ ask you to set these environment variables upon first running and if they are no
 the `aviary configure` subcommand to reset the environment variables:
 
 ```commandline
-aviary configure -o logs/ --eggnog-db-path /shared/db/eggnog/ --gtdb-path /shared/db/gtdb/ --checkm2-db-path /shared/db/checkm2db/ --download
+aviary configure -o logs/ --eggnog-db-path /shared/db/eggnog/ --gtdb-path /shared/db/gtdb/ --checkm2-db-path /shared/db/checkm2db/ --singlem-metapackage-path /shared/db/singlem/ --download
 ```
 
 This command will check if the databases exist at those given locations, if they don't then aviary will download and change
diff --git a/aviary/__init__.py b/aviary/__init__.py
index 732155f8..3e2f46a3 100644
--- a/aviary/__init__.py
+++ b/aviary/__init__.py
@@ -1 +1 @@
-__version__ = "0.8.3"
+__version__ = "0.9.0"
diff --git a/aviary/aviary.py b/aviary/aviary.py
index 56e42eff..1e88a6e1 100755
--- a/aviary/aviary.py
+++ b/aviary/aviary.py
@@ -598,23 +598,65 @@ def main():
         default=3
     )
 
+    binning_group.add_argument(
+        '--extra-binners', '--extra_binners', '--extra-binner', '--extra_binner',
+        help='Optional list of extra binning algorithms to run. Can be any combination of: \n'
+             'maxbin, maxbin2, concoct \n'
+             'These binners are skipped by default as they can have long runtimes \n'
+             'N.B. specifying "maxbin" and "maxbin2" are equivalent \n',
+        dest='extra_binners',
+        nargs='*',
+        choices=["maxbin", "maxbin2", "concoct"]
+    )
+
     binning_group.add_argument(
         '--skip-binners', '--skip_binners', '--skip_binner', '--skip-binner',
         help='Optional list of binning algorithms to skip. Can be any combination of: \n'
-             'rosella, semibin, metabat1, metabat2, metabat, vamb, concoct, maxbin2, maxbin \n'
-             'Capitals will be auto-corrected. N.B. specifying "metabat" will skip both \n'
-             'MetaBAT1 and MetaBAT2.',
+             'rosella, semibin, metabat1, metabat2, metabat, vamb \n'
+             'N.B. specifying "metabat" will skip both MetaBAT1 and MetaBAT2. \n',
         dest='skip_binners',
-        nargs='*'
-        # default=["maxbin2"]
+        nargs='*',
+        choices=["rosella", "semibin", "metabat1", "metabat2", "metabat", "vamb"]
+    )
+
+    binning_group.add_argument(
+        '--binning-only', '--binning_only',
+        help='Only run up to the binning stage. Do not run SingleM, GTDB-tk, or CoverM',
+        type=str2bool,
+        nargs='?',
+        const=True,
+        dest='binning_only',
+        default=False,
     )
 
     binning_group.add_argument(
         '--skip-abundances', '--skip_abundances',
         help='Skip CoverM post-binning abundance calculations.',
         dest='skip_abundances',
+        type=str2bool,
+        nargs='?',
+        const=True,
+        default=False,
+    )
+
+    binning_group.add_argument(
+        '--skip-taxonomy', '--skip_taxonomy',
+        help='Skip GTDB-tk post-binning taxonomy assignment.',
+        dest='skip_taxonomy',
+        type=str2bool,
+        nargs='?',
+        const=True,
+        default=False,
+    )
+
+    binning_group.add_argument(
+        '--skip-singlem', '--skip_singlem',
+        help='Skip SingleM post-binning recovery assessment.',
+        dest='skip_singlem',
+        type=str2bool,
+        nargs='?',
+        const=True,
         default=False,
-        action="store_true",
     )
 
     ####################################################################
diff --git a/aviary/envs/checkm2.yaml b/aviary/envs/checkm2.yaml
index 84a03d8e..04c4ecc5 100755
--- a/aviary/envs/checkm2.yaml
+++ b/aviary/envs/checkm2.yaml
@@ -3,16 +3,16 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - python>=3.6, <3.9
-  - scikit-learn=0.23.2
-  - h5py=2.10.0
-  - numpy=1.21.6
-  - diamond=2.0.4
-  - tensorflow >= 2.1.0, <=2.6
+  - python >=3.7, <3.9
+  - scikit-learn =0.23.2
+  - h5py =2.10.0
+  - numpy =1.19.2
+  - diamond =2.0.4
+  - tensorflow >= 2.2.0, <2.6.0
   - lightgbm =3.2.1
-  - pandas >=1.4.0, <2.0
-  - scipy
-  - prodigal >=2.6.3
+  - pandas =1.4.0
+  - scipy =1.8.0
+  - prodigal =2.6.3
   - setuptools
   - requests
   - packaging
diff --git a/aviary/modules/annotation/annotation.smk b/aviary/modules/annotation/annotation.smk
index 6b2ca96a..99190c93 100644
--- a/aviary/modules/annotation/annotation.smk
+++ b/aviary/modules/annotation/annotation.smk
@@ -99,7 +99,7 @@ rule download_gtdb:
 
         # Uncompress and pipe output to TQDM
         'echo "[INFO] - Extracting archive..."; '
-        'tar xvzf "$TARGET_TAR" -C "${{TARGET_DIR}}" --strip 1; '
+        'tar -xvzf "$TARGET_TAR" -C "${{TARGET_DIR}}" --strip 1; '
 
         # Remove the file after successful extraction
         'rm "$TARGET_TAR"; '
@@ -122,7 +122,7 @@ rule download_singlem_metapackage:
         'logs/download_singlem.log'
     shell:
         'singlem data --output-directory {params.metapackage_folder}_tmp 2> {log} && '
-        'mv {params.metapackage_folder}_tmp/*.smpkg.zb/payload_directory {params.metapackage_folder}'
+        'mv {params.metapackage_folder}_tmp/*.smpkg.zb {params.metapackage_folder}'
 
 rule download_checkm2:
     params:
diff --git a/aviary/modules/binning/binning.smk b/aviary/modules/binning/binning.smk
index eb514c38..ed8593b2 100644
--- a/aviary/modules/binning/binning.smk
+++ b/aviary/modules/binning/binning.smk
@@ -696,7 +696,7 @@ rule finalise_stats:
         checkm1_done = "bins/checkm.out",
         checkm2_done = "bins/checkm2_output/quality_report.tsv",
         coverage_file = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [],
-        gtdbtk_done = "data/gtdbtk/done"
+        gtdbtk_done = "data/gtdbtk/done" if not config["skip_taxonomy"] else []
     output:
         bin_stats = "bins/bin_info.tsv",
         checkm_minimal = "bins/checkm_minimal.tsv"
@@ -732,12 +732,14 @@ rule checkm_das_tool:
 rule singlem_pipe_reads:
     output:
         "data/singlem_out/metagenome.combined_otu_table.csv"
+    params:
+        package_path = os.environ["SINGLEM_METAPACKAGE_PATH"]
     threads: min(config["max_threads"], 48)
     resources:
         mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 8*1024*attempt),
         runtime = lambda wildcards, attempt: 12*60*attempt,
     log:
-        "data/singlem_out/singlem_reads_log.txt"
+        "logs/singlem_pipe_reads_log.txt"
     conda:
         "../../envs/singlem.yaml"
     script:
@@ -745,14
+747,19 @@ rule singlem_pipe_reads: rule singlem_appraise: input: - metagenome = "data/singlem_out/metagenome.combined_otu_table.csv", - gtdbtk_done = "data/gtdbtk/done", + pipe_results = "data/singlem_out/metagenome.combined_otu_table.csv", + assembly = config["fasta"], + # gtdbtk_done = "data/gtdbtk/done", bins_complete = "bins/checkm.out" output: - "data/singlem_out/singlem_appraisal.tsv" + binned = "data/singlem_out/binned.otu_table.csv", + unbinned = "data/singlem_out/unbinned.otu_table.csv", + plot = "data/singlem_out/singlem_appraise.svg", + assembled = "data/singlem_out/assembled.otu_table.csv", + singlem = "data/singlem_out/singlem_appraisal.tsv" params: - pplacer_threads = config['pplacer_threads'], - fasta = config['fasta'] + package_path = os.environ["SINGLEM_METAPACKAGE_PATH"], + genomes_folder = "data/refined_bins/final_bins/" threads: min(config["max_threads"], 48) resources: mem_mb = lambda wildcards, attempt: min(int(config["max_memory"])*1024, 8*1024*attempt), @@ -760,61 +767,41 @@ rule singlem_appraise: conda: "../../envs/singlem.yaml" log: - "data/singlem_out/singlem_log.txt" - shell: - # We use bash -c so that a non-zero exitstatus of the non-final commands doesn't cause the rule (and therefore aviary) to fail - "bash -c 'singlem pipe --threads {threads} --genome-fasta-file bins/final_bins/*.fna --otu-table data/singlem_out/genomes.otu_table.csv && " - "singlem pipe --threads {threads} --genome-fasta-file {params.fasta} --otu-table data/singlem_out/assembly.otu_table.csv && " - "singlem appraise --metagenome-otu-tables {input.metagenome} --genome-otu-tables data/singlem_out/genomes.otu_table.csv " - "--assembly-otu-table data/singlem_out/assembly.otu_table.csv " - "--plot data/singlem_out/singlem_appraise.svg --output-binned-otu-table data/singlem_out/binned.otu_table.csv " - "--output-unbinned-otu-table data/singlem_out/unbinned.otu_table.csv > data/singlem_out/singlem_appraisal.tsv' 2> {log} || " - "echo 'SingleM Errored, please check data/singlem_out/singlem_log.txt'; touch data/singlem_out/singlem_appraisal.tsv" - + "logs/singlem_appraise_log.txt" + script: + "../../scripts/singlem_appraise.py" rule recover_mags: input: final_bins = "bins/bin_info.tsv", - gtdbtk = "data/gtdbtk/done", + gtdbtk = "data/gtdbtk/done" if not config["skip_taxonomy"] else [], coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [], - singlem = "data/singlem_out/singlem_appraisal.tsv" + contig_coverage = "data/coverm.cov", + singlem = "data/singlem_out/singlem_appraisal.tsv" if not config["skip_singlem"] else [], conda: "../../envs/coverm.yaml" output: - bins = "bins/done", - diversity = 'diversity/done' + bins = "bins/done" threads: config["max_threads"] - shell: - "cd bins/; " - "ln -s ../data/coverm_abundances.tsv ./; " - "ln -s ../data/coverm.cov ./; " - "cd ../; " - "ln -sr data/singlem_out/ diversity || echo 'SingleM linked'; " - "ln -sr data/gtdbtk taxonomy || echo 'GTDB-tk linked'; " - "touch bins/done; " - "touch diversity/done; " - "rm -f data/binning_bams/*bam; " - "rm -f data/binning_bams/*bai; " + script: + "scripts/finalise_recovery.py" rule recover_mags_no_singlem: input: final_bins = "bins/bin_info.tsv", + gtdbtk = [], coverm = "data/coverm_abundances.tsv" if not config["skip_abundances"] else [], + contig_coverage = "data/coverm.cov", + singlem = [], conda: "../../envs/coverm.yaml" output: bins = "bins/done", threads: config["max_threads"] - shell: - "cd bins/; " - "ln -s ../data/coverm_abundances.tsv ./; " - "ln -s ../data/coverm.cov ./; " - "cd 
../; " - "touch bins/done; " - "rm -f data/binning_bams/*bam; " - "rm -f data/binning_bams/*bai; " + script: + "scripts/finalise_recovery.py" # Special rule to help out with a buggy output rule dereplicate_and_get_abundances_paired: diff --git a/aviary/modules/binning/envs/rosella.yaml b/aviary/modules/binning/envs/rosella.yaml index ee398172..8be01ff4 100644 --- a/aviary/modules/binning/envs/rosella.yaml +++ b/aviary/modules/binning/envs/rosella.yaml @@ -2,31 +2,8 @@ channels: - conda-forge - numba - bioconda - - defaults dependencies: - - python >= 3.8, <= 3.10 - - gcc - - cxx-compiler - - rosella >= 0.5.2 - - numba >= 0.53, <= 0.57 - - numpy <= 1.24 - - joblib >= 1.1.0, <= 1.3 - - scikit-bio >= 0.5.7 - - umap-learn >= 0.5.3 - - scipy <= 1.11 - - pandas >= 1.3 - - pynndescent >= 0.5.7 - - hdbscan >= 0.8.28 - - scikit-learn >= 1.0.2, <= 1.1 - - flight-genome >= 1.6.1 + - rosella >= 0.5.3 + - flight-genome >= 1.6.3 - coverm >= 0.6.1 - - seaborn - - imageio - - matplotlib - - tqdm - - tbb - - joblib - - pebble - - threadpoolctl - - biopython - checkm-genome==1.1.3 diff --git a/aviary/modules/binning/scripts/finalise_recovery.py b/aviary/modules/binning/scripts/finalise_recovery.py new file mode 100644 index 00000000..444f7aa1 --- /dev/null +++ b/aviary/modules/binning/scripts/finalise_recovery.py @@ -0,0 +1,44 @@ +import os +from pathlib import Path +import glob + +def check_and_remove_base_file(file_path) -> str: + file_name = os.path.basename(file_path) + if os.path.exists(file_name): + os.remove(file_name) + + return file_name + + +if __name__ == '__main__': + final_bins = snakemake.input.final_bins + coverage_file = snakemake.input.coverm + contig_coverage = snakemake.input.contig_coverage + gtdbtk = snakemake.input.gtdbtk + singlem = snakemake.input.singlem + + output_taxonomy = "taxonomy" + output_singlem = "diversity" + + os.chdir('bins/') + + if len(coverage_file) > 0: + file_name = check_and_remove_base_file(coverage_file) + os.symlink(f"../{coverage_file}", f"{file_name}") + + if len(contig_coverage) > 0: + file_name = check_and_remove_base_file(contig_coverage) + os.symlink(f"../{contig_coverage}", f"{file_name}") + + os.chdir('..') + if len(gtdbtk) > 0: + check_and_remove_base_file(output_taxonomy) + os.symlink(f"{os.path.dirname(gtdbtk)}", output_taxonomy) + if len(singlem) > 0: + check_and_remove_base_file(output_singlem) + os.symlink(f"{os.path.dirname(singlem)}", output_singlem) + + for f in glob.glob('data/binning_bams/*.ba*'): + os.remove(f) + + Path('bins/done').touch() diff --git a/aviary/modules/binning/scripts/finalise_stats.py b/aviary/modules/binning/scripts/finalise_stats.py index 0b519cf7..3b48d41d 100644 --- a/aviary/modules/binning/scripts/finalise_stats.py +++ b/aviary/modules/binning/scripts/finalise_stats.py @@ -55,9 +55,12 @@ def get_taxonomy(rename_columns="Bin Id"): taxa.append(df_arc) except (FileNotFoundError, IndexError) as e: pass - - taxa = pd.concat(taxa) - taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1) + + try: + taxa = pd.concat(taxa) + taxa.rename({'user_genome' : rename_columns}, inplace=True, axis=1) + except ValueError: + taxa = pd.DataFrame(columns=[rename_columns]) return taxa diff --git a/aviary/modules/processor.py b/aviary/modules/processor.py index 2523f0c3..bba9dd66 100644 --- a/aviary/modules/processor.py +++ b/aviary/modules/processor.py @@ -112,16 +112,32 @@ def __init__(self, self.refinery_max_iterations = args.refinery_max_iterations self.refinery_max_retries = args.refinery_max_retries self.skip_abundances = 
args.skip_abundances + self.skip_taxonomy = args.skip_taxonomy + self.skip_singlem = args.skip_singlem + if args.binning_only: + self.skip_abundances = True + self.skip_taxonomy = True + self.skip_singlem = True + self.binning_only = args.binning_only + + self.skip_binners = ["maxbin2", "concoct"] + if args.extra_binners: + for binner in args.extra_binners: + binner = binner.lower() + if binner == "maxbin" or binner == "maxbin2": + self.skip_binners.remove("maxbin2") + elif binner == "concoct": + self.skip_binners.remove("concoct") + else: + logging.warning(f"Unknown extra binner {binner} specified. Skipping...") - self.skip_binners = [] if args.skip_binners: for binner in args.skip_binners: + binner = binner.lower() if binner == "metabat": self.skip_binners.extend(["metabat_sens", "metabat_ssens", "metabat_spec", "metabat_sspec", "metabat2"]) elif binner == "metabat1": self.skip_binners.extend(["metabat_sens", "metabat_ssens", "metabat_spec", "metabat_sspec"]) - elif binner == "maxbin": - self.skip_binners.append("maxbin2") else: self.skip_binners.append(binner) @@ -133,6 +149,9 @@ def __init__(self, self.refinery_max_retries = 3 self.skip_binners = ["none"] self.skip_abundances = False + self.binning_only = False + self.skip_taxonomy = False + self.skip_singlem = False try: self.assembly = args.assembly @@ -354,6 +373,9 @@ def make_config(self): conf["gsa_mappings"] = self.gsa_mappings conf["skip_binners"] = self.skip_binners conf["skip_abundances"] = self.skip_abundances + conf["skip_taxonomy"] = self.skip_taxonomy + conf["skip_singlem"] = self.skip_singlem + conf["binning_only"] = self.binning_only conf["semibin_model"] = self.semibin_model conf["refinery_max_iterations"] = self.refinery_max_iterations conf["refinery_max_retries"] = self.refinery_max_retries @@ -604,4 +626,4 @@ def fraction_to_percent(val): val = float(val) if val <= 1: return val * 100 - return val \ No newline at end of file + return val diff --git a/aviary/modules/quality_control/scripts/qc_short_reads.py b/aviary/modules/quality_control/scripts/qc_short_reads.py index 6a6b255a..f5c01b70 100755 --- a/aviary/modules/quality_control/scripts/qc_short_reads.py +++ b/aviary/modules/quality_control/scripts/qc_short_reads.py @@ -87,20 +87,49 @@ def combine_reads( # we've got to concatenate the files together if coassemble: - # reads 1 first - for reads_1, reads_2 in zip(short_reads_1, short_reads_2): - if reads_1 == "none" or reads_2 == "none": - continue - - if not os.path.exists(reads_1): - logf.write(f"Short read file {reads_1} does not exist\n") - exit(1) - - if not os.path.exists(reads_2): - logf.write(f"Short read file {reads_2} does not exist\n") - exit(1) + if "none" not in short_reads_2 and "none" not in short_reads_1: + for reads_1, reads_2 in zip(short_reads_1, short_reads_2): + if reads_1 == "none" or reads_2 == "none": + continue + + if not os.path.exists(reads_1): + logf.write(f"Short read file {reads_1} does not exist\n") + exit(1) + + if not os.path.exists(reads_2): + logf.write(f"Short read file {reads_2} does not exist\n") + exit(1) + + setup_interleave(reads_1, reads_2, output_fastq, logf) + elif "none" not in short_reads_1: - setup_interleave(reads_1, reads_2, output_fastq, logf) + gzip_output = False + if not output_fastq.endswith(".gz"): + output_fastq = output_fastq + '.gz' + + for reads_1 in short_reads_1: + if reads_1 == "none": + continue + + if not os.path.exists(reads_1): + logf.write(f"Short read file {reads_1} does not exist\n") + exit(1) + + cat_reads(reads_1, output_fastq, threads, log_file) 
+ elif "none" not in short_reads_2: + gzip_output = False + if not output_fastq.endswith(".gz"): + output_fastq = output_fastq + '.gz' + + for reads_2 in short_reads_2: + if reads_2 == "none": + continue + + if not os.path.exists(reads_2): + logf.write(f"Short read file {reads_2} does not exist\n") + exit(1) + + cat_reads(reads_2, output_fastq, threads, log_file) else: @@ -287,13 +316,13 @@ def filter_illumina_reference( # run fastp for quality control of reads se1_string = None if "none" not in short_reads_1: - combine_reads(short_reads_1, ["none"], "data/short_reads.pre_qc.1.fastq.gz", coassemble, log, threads) se1_string = "data/short_reads.pre_qc.1.fastq.gz" + combine_reads(short_reads_1, ["none"], se1_string, coassemble, log, threads) se2_string = None if "none" not in short_reads_2: - combine_reads(["none"], short_reads_2, "data/short_reads.pre_qc.2.fastq.gz", coassemble, log, threads) se2_string = "data/short_reads.pre_qc.2.fastq.gz" + combine_reads(["none"], short_reads_2, se2_string, coassemble, log, threads) run_fastp( reads_1=se1_string, diff --git a/aviary/modules/strain_analysis/scripts/run_lorikeet.py b/aviary/modules/strain_analysis/scripts/run_lorikeet.py index d9aeb612..422d7fac 100755 --- a/aviary/modules/strain_analysis/scripts/run_lorikeet.py +++ b/aviary/modules/strain_analysis/scripts/run_lorikeet.py @@ -30,7 +30,7 @@ def run_lorikeet( else: short_reads = "" - lorikeet_cmd = f"lorikeet call -t {threads} -d {mag_directory} -x {mag_extension} -p {parallel_genomes} {short_reads} {long_reads} -o {output_directory} --do-not-call-svs --calculate-dnds --calculate-fst".split() + lorikeet_cmd = f"lorikeet call -t {threads} -d {mag_directory} -x {mag_extension} -P {parallel_genomes} {short_reads} {long_reads} -o {output_directory} --do-not-call-svs --calculate-dnds --calculate-fst".split() run(lorikeet_cmd) diff --git a/aviary/scripts/singlem_appraise.py b/aviary/scripts/singlem_appraise.py new file mode 100644 index 00000000..026d4b92 --- /dev/null +++ b/aviary/scripts/singlem_appraise.py @@ -0,0 +1,143 @@ +from subprocess import CalledProcessError, run, STDOUT +import os +from pathlib import Path +import logging +import gzip +import subprocess +import tempfile +import glob + + +class SingleMContainer: + def __init__(self, threads: int, output_dir: str, genomes: str, assembly: str, pipe_results: str, logf: str): + self.commands = [] + self.threads = threads + self.pipe_results = pipe_results + self.genomes = [] + # for each file ending in .fna in genomes folder, add to self.genomes + for file in os.listdir(genomes): + if file.endswith(".fna"): + self.genomes.append(f"{genomes}/{file}") + self.assembly = assembly + self.output_dir = output_dir + self.intermediate_dir = os.path.join(self.output_dir, "intermediate_genomes") + os.makedirs(self.output_dir, exist_ok=True) + os.makedirs(self.intermediate_dir, exist_ok=True) + self.logf = logf + self.process_queue = [] + + + def run(self): + with open(self.logf, "a") as logf: + logf.write("generating SingleM commands\n") + self.create_commands() + for command in self.commands: + logf.write(" ".join(command) + "\n") + logf.write("running SingleM commands\n") + self.run_commands(logf) + self.appraise_otu_tables(logf) + + def appraise_otu_tables(self, logf): + logf.write("Appraising SingleM otu tables\n") + genome_otu_tables = glob.glob(os.path.join(self.intermediate_dir, "*genome_single*.csv")) + assembly_otu_table = os.path.join(self.intermediate_dir, "metagenome.assembly_0_otu_table.csv") + appraise_cmd = f"singlem appraise 
--genome-otu-tables {' '.join(genome_otu_tables)} \ + --assembly-otu-table {assembly_otu_table} \ + --metagenome-otu-tables {self.pipe_results} \ + --plot {os.path.join(self.output_dir, 'singlem_appraise.svg')} \ + --output-binned-otu-table {os.path.join(self.output_dir, 'binned.otu_table.csv')} \ + --output-unbinned-otu-table {os.path.join(self.output_dir, 'unbinned.otu_table.csv')} \ + --output-assembled-otu-table {os.path.join(self.output_dir, 'assembled.otu_table.csv')}".split() + try: + # create output file: data/singlem_out/singlem_appraisal.tsv + output_file = os.path.join(self.output_dir, "singlem_appraisal.tsv") + with open(output_file, "w") as outf: + with open(self.logf, "a") as logf: + run(appraise_cmd, stdout=outf, stderr=logf) + Path("data/singlem_out/metagenome.combined_otu_table.csv").touch() + except CalledProcessError as e: + with open(log, "a") as logf: + logf.write(e) + logf.write("\nSingleM appraise failed. Exiting.\n") + Path("data/singlem_out/metagenome.combined_otu_table.csv").touch() + + def create_commands(self): + self._create_genome_commands() + self._create_assembly_commands() + + + def run_commands(self, logf): + process_index = 0 + for command in self.commands: + f = tempfile.TemporaryFile() + logf.write(f"running process {process_index} {' '.join(command)}\n") + p = subprocess.Popen(command, stdout=f, stderr=logf) + self.process_queue.append((p, f)) + process_index += 1 + if len(self.process_queue) >= self.threads: + self._check_processes(self.threads + 1, logf) + + # write how many processes are left + logf.write(f"waiting for {len(self.process_queue)} processes to finish\n") + while len(self.process_queue) > 0: + self._check_processes(0, logf) + + def _check_processes(self, max_processes: int, logf): + while len(self.process_queue) > max_processes: + for i, (p, f) in enumerate(self.process_queue): + if p.poll() is not None: + for line in f: + logf.write(line) + f.close() + self.process_queue.pop(i) + break + + + def _create_assembly_commands(self): + threads = max(self.threads // (len(self.genomes) + 1), 1) + command = f"singlem pipe --threads {threads} --genome-fasta-files {self.assembly} --otu-table {self.intermediate_dir}/metagenome.assembly_0_otu_table.csv".split() + self.commands.append(command) + + def _create_genome_commands(self): + threads = max(self.threads // (len(self.genomes) + 1), 1) + for i, genome in enumerate(self.genomes): + command = f"singlem pipe --threads {threads} --genome-fasta-files {genome} --otu-table {self.intermediate_dir}/metagenome.genome_single_{i}_otu_table.csv".split() + self.commands.append(command) + + +def run_singlem( + genomes_folder: str, + assembly: str, + pipe_results: str, + threads: int, + log: str, +): + output_dir = "data/singlem_out" + singlem_container = SingleMContainer(threads, output_dir, genomes_folder, assembly, pipe_results, log) + singlem_container.run() + +def valid_path(path: str) -> bool: + return os.path.exists(path) + +if __name__ == '__main__': + # check if SINGLEM_METAPACKAGE_PATH environment variable is set and path is valid + # if not then, error and exit + os.environ["SINGLEM_METAPACKAGE_PATH"] = snakemake.params.package_path + if "SINGLEM_METAPACKAGE_PATH" not in os.environ or not valid_path(os.environ["SINGLEM_METAPACKAGE_PATH"]): + raise ValueError("SINGLEM_METAPACKAGE_PATH environment variable not set. Please set using 'aviary configure' or manually. 
Exiting.") + + assembly = snakemake.input.assembly + genomes = snakemake.params.genomes_folder + pipe_results = snakemake.input.pipe_results + threads = snakemake.threads + log = snakemake.log[0] + + with open(log, "w") as logf: pass + + run_singlem( + genomes, + assembly, + pipe_results, + threads, + log, + ) diff --git a/aviary/scripts/singlem_reads.py b/aviary/scripts/singlem_reads.py index 1321f713..2682cd0d 100644 --- a/aviary/scripts/singlem_reads.py +++ b/aviary/scripts/singlem_reads.py @@ -1,71 +1,248 @@ from subprocess import CalledProcessError, run, STDOUT import os from pathlib import Path +import logging +import gzip +import subprocess +import tempfile +import glob + +class ReadContainer: + def __init__(self, long_reads, short_reads_1, short_reads_2): + if long_reads == "none": + long_reads = [] + + if short_reads_1 == "none": + short_reads_1 = [] + + if short_reads_2 == "none": + short_reads_2 = [] + + self.long_reads = long_reads + self.short_reads_1 = short_reads_1 + self.short_reads_2 = short_reads_2 + + def _check_paired_reads(self): + if len(self.short_reads_2) == 0: + return False + if len(self.short_reads_1) == 0: + return False + if len(self.short_reads_1) != len(self.short_reads_2): + logging.warning("Short read 1 and short read 2 are not the same length but paired reads assumed, please check input") + return False + return True + + def get_paired_reads(self): + if not self._check_paired_reads(): + return [] + + paired_reads = [] + for i in range(len(self.short_reads_1)): + paired_reads.append((self.short_reads_1[i], self.short_reads_2[i])) + return paired_reads + + def get_paired_read_count(self): + if not self._check_paired_reads() or self._check_interleaved(): + return 0 + return len(self.short_reads_1) + + def get_single_reads(self): + if self._check_paired_reads() or self._check_interleaved(): + return [] + return self.short_reads_1 if len(self.short_reads_1) > 0 else self.short_reads_2 + + def get_single_read_count(self): + if self._check_paired_reads() or self._check_interleaved(): + return 0 + return len(self.short_reads_1) if len(self.short_reads_1) > 0 else len(self.short_reads_2) + + def get_long_read_count(self): + return len(self.long_reads) + + def get_long_reads(self): + return self.long_reads + + def get_interleaved_reads(self): + if not self._check_interleaved() or self._check_paired_reads(): + return [] + return self.short_reads_1 if len(self.short_reads_1) > 0 else self.short_reads_2 + + def get_interleaved_read_count(self): + if not self._check_interleaved() or self._check_paired_reads(): + return 0 + return len(self.short_reads_1) if len(self.short_reads_1) > 0 else len(self.short_reads_2) + + def _check_interleaved(self): + if self._check_paired_reads(): + return False + if len(self.short_reads_1) == 0 and len(self.short_reads_2) == 0: + return False + + short_reads = self.short_reads_1 if len(self.short_reads_1) > 0 else self.short_reads_2 + # open the reads and check if they are interleaved + # if they are then return True + # if they are not then return False + + # check if first file is gzipped + interleaved = False + if short_reads[0].endswith(".gz"): + with gzip.open(short_reads[0], "rt") as read_file: + interleaved = self._forward_and_reverse_present(read_file) + else: + with open(short_reads[0], "r") as read_file: + interleaved = self._forward_and_reverse_present(read_file) + + return interleaved + + def _forward_and_reverse_present(self, file) -> bool: + read_pattern = [] + for i, line in enumerate(file): + if i > 32: + break + if i % 4 
== 0: + if line.endswith("1"): + read_pattern.append(1) + elif line.endswith("2"): + read_pattern.append(2) + else: + read_pattern.append(0) + + + for r1, r2 in zip(read_pattern[::2], read_pattern[1::2]): + if r1 == 0 or r2 == 0: + return False + if r1 == r2: + return False + if r1 == 1 and r2 == 2: + return True + + return False + + def get_total_read_count(self): + return self.get_long_read_count() + self.get_single_read_count() + self.get_paired_read_count() + self.get_interleaved_read_count() + +class SingleMContainer: + def __init__(self, threads: int, output_dir: str, read_container: ReadContainer, logf: str): + self.commands = [] + self.threads = threads + self.read_container = read_container + self.output_dir = output_dir + self.intermediate_dir = os.path.join(self.output_dir, "intermediate") + os.makedirs(self.output_dir, exist_ok=True) + os.makedirs(self.intermediate_dir, exist_ok=True) + self.logf = logf + self.process_queue = [] + self.total_reads = self.read_container.get_total_read_count() + + + def run(self): + with open(self.logf, "a") as logf: + logf.write("generating SingleM commands\n") + self.create_commands() + for command in self.commands: + logf.write(" ".join(command) + "\n") + logf.write("running SingleM commands\n") + self.run_commands(logf) + self.combine_otu_tables(logf) + + def combine_otu_tables(self, logf): + logf.write("combining SingleM otu tables\n") + intermediate_otu_tables = glob.glob(os.path.join(self.intermediate_dir, "*.csv")) + summarise_cmd = f"singlem summarise --input-otu-tables {' '.join(intermediate_otu_tables)} --output-otu-table {os.path.join(self.output_dir, 'metagenome.combined_otu_table.csv')}".split() + try: + with open(self.logf, "a") as logf: + run(summarise_cmd, stdout=logf, stderr=STDOUT) + Path("data/singlem_out/metagenome.combined_otu_table.csv").touch() + except CalledProcessError as e: + with open(log, "a") as logf: + logf.write(e) + logf.write("\nSingleM summarise failed. 
Exiting.\n") + Path("data/singlem_out/metagenome.combined_otu_table.csv").touch() + + def create_commands(self): + self._create_longread_commands() + self._create_shortread_commands() + + + def run_commands(self, logf): + process_index = 0 + for command in self.commands: + f = tempfile.TemporaryFile() + p = subprocess.Popen(command, stdout=f, stderr=logf) + self.process_queue.append((p, f)) + process_index += 1 + if len(self.process_queue) >= self.threads: + self._check_processes(self.threads + 1, logf) + + # write how many processes are left + logf.write(f"waiting for {len(self.process_queue)} processes to finish\n") + while len(self.process_queue) > 0: + self._check_processes(0, logf) + + def _check_processes(self, max_processes: int, logf): + while len(self.process_queue) > max_processes: + for i, (p, f) in enumerate(self.process_queue): + if p.poll() is not None: + for line in f: + logf.write(line) + f.close() + self.process_queue.pop(i) + break + + + def _create_longread_commands(self): + threads = max(self.threads // self.total_reads, 1) + for i, long_read in enumerate(self.read_container.get_long_reads()): + command = f"singlem pipe --threads {threads} --sequences {long_read} --otu-table {self.intermediate_dir}/metagenome.longread_{i}_otu_table.csv".split() + self.commands.append(command) + + def _create_shortread_commands(self): + threads = max(self.threads // self.total_reads, 1) + for i, short_read in enumerate(self.read_container.get_single_reads()): + command = f"singlem pipe --threads {threads} --sequences {short_read} --otu-table {self.intermediate_dir}/metagenome.shortread_single_{i}_otu_table.csv".split() + self.commands.append(command) + + for i, (short_read_1, short_read_2) in enumerate(self.read_container.get_paired_reads()): + command = f"singlem pipe --threads {threads} --forward {short_read_1} --reverse {short_read_2} --otu-table {self.intermediate_dir}/metagenome.shortread_paired_{i}_otu_table.csv".split() + self.commands.append(command) + + for i, interleaved_read in enumerate(self.read_container.get_interleaved_reads()): + command = f"singlem pipe --threads {threads} --sequences {interleaved_read} --otu-table {self.intermediate_dir}/metagenome.shortread_interleaved_{i}_otu_table.csv".split() + self.commands.append(command) + def run_singlem( - long_reads, - short_reads_1, - short_reads_2, + read_container: ReadContainer, threads: int, log: str, ): - try: - os.mkdir("data/singlem_out/") - except OSError: - with open(log, "a") as logf: - logf.write("Using prexisting directory: data/singlem_out/\n") - - singlem_output_list = [] - if long_reads != "none": - singlem_pipe_cmd = f"singlem pipe --threads {threads} --sequences {' '.join(long_reads)} --otu-table data/singlem_out/metagenome.longread_otu_table.csv".split() - with open(log, "a") as logf: - run(singlem_pipe_cmd, stdout=logf, stderr=STDOUT) - - if short_reads_2 != "none": - singlem_pipe_cmd = f"singlem pipe --threads {threads} --forward {' '.join(short_reads_1)} --reverse {' '.join(short_reads_2)} --otu-table data/singlem_out/metagenome.shortread_otu_table.csv".split() - with open(log, "a") as logf: - run(singlem_pipe_cmd, stdout=logf, stderr=STDOUT) - - elif short_reads_1 != "none": - singlem_pipe_cmd = f"singlem pipe --threads {threads} --sequences {' '.join(short_reads_1)} --otu-table data/singlem_out/metagenome.shortread_otu_table.csv".split() - with open(log, "a") as logf: - run(singlem_pipe_cmd, stdout=logf, stderr=STDOUT) - - - # if file exists then add it to otu output list - if 
os.path.exists("data/singlem_out/metagenome.longread_otu_table.csv"): - singlem_output_list.append("data/singlem_out/metagenome.longread_otu_table.csv") - - if os.path.exists("data/singlem_out/metagenome.shortread_otu_table.csv"): - singlem_output_list.append("data/singlem_out/metagenome.shortread_otu_table.csv") - - summarise_cmd = f"singlem summarise --input-otu-tables {' '.join(singlem_output_list)} --output-otu-table data/singlem_out/metagenome.combined_otu_table.csv".split() - - try: - with open(log, "a") as logf: - run(summarise_cmd, stdout=logf, stderr=STDOUT) - Path("data/singlem_out/metagenome.combined_otu_table.csv").touch() - except CalledProcessError as e: - with open(log, "a") as logf: - logf.write(e) - logf.write("\nSingleM summarise failed. Exiting.\n") - Path("data/singlem_out/metagenome.combined_otu_table.csv").touch() + output_dir = "data/singlem_out" + singlem_container = SingleMContainer(threads, output_dir, read_container, log) + singlem_container.run() +def valid_path(path: str) -> bool: + return os.path.exists(path) if __name__ == '__main__': + # check if SINGLEM_METAPACKAGE_PATH environment variable is set and path is valid + # if not then, error and exit + os.environ["SINGLEM_METAPACKAGE_PATH"] = snakemake.params.package_path + if "SINGLEM_METAPACKAGE_PATH" not in os.environ or not valid_path(os.environ["SINGLEM_METAPACKAGE_PATH"]): + raise ValueError("SINGLEM_METAPACKAGE_PATH environment variable not set. Please set using 'aviary configure' or manually. Exiting.") + long_reads = snakemake.config['long_reads'] short_reads_1 = snakemake.config['short_reads_1'] short_reads_2 = snakemake.config['short_reads_2'] pplacer_threads = snakemake.threads log = snakemake.log[0] + read_container = ReadContainer(long_reads, short_reads_1, short_reads_2) + with open(log, "w") as logf: pass run_singlem( - long_reads, - short_reads_1, - short_reads_2, + read_container, pplacer_threads, log, ) diff --git a/test/test_integration.py b/test/test_integration.py index 4f868494..687143dd 100755 --- a/test/test_integration.py +++ b/test/test_integration.py @@ -96,12 +96,12 @@ def test_short_read_recovery(self): self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/data/final_contigs.fasta")) self.assertTrue(os.path.islink(f"{output_dir}/aviary_out/assembly/final_contigs.fasta")) - self.assertTrue(os.path.islink(f"{output_dir}/aviary_out/diversity/singlem_out")) - self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_out/metagenome.combined_otu_table.csv")) - self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/singlem_out/metagenome.combined_otu_table.csv") > 0) - self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_out/singlem_appraisal.tsv")) - self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/singlem_out/singlem_appraisal.tsv") > 0) - self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_out/singlem_appraise.svg")) + self.assertTrue(os.path.islink(f"{output_dir}/aviary_out/diversity")) + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/metagenome.combined_otu_table.csv")) + self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/metagenome.combined_otu_table.csv") > 0) + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_appraisal.tsv")) + self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/singlem_appraisal.tsv") > 0) + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_appraise.svg")) def 
test_long_read_recovery(self): output_dir = os.path.join("example", "test_long_read_recovery") @@ -123,12 +123,12 @@ def test_long_read_recovery(self): self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/data/final_contigs.fasta")) self.assertTrue(os.path.islink(f"{output_dir}/aviary_out/assembly/final_contigs.fasta")) - self.assertTrue(os.path.islink(f"{output_dir}/aviary_out/diversity/singlem_out")) - self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_out/metagenome.combined_otu_table.csv")) - self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/singlem_out/metagenome.combined_otu_table.csv") > 0) - self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_out/singlem_appraisal.tsv")) - self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/singlem_out/singlem_appraisal.tsv") > 0) - self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_out/singlem_appraise.svg")) + self.assertTrue(os.path.islink(f"{output_dir}/aviary_out/diversity")) + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/metagenome.combined_otu_table.csv")) + self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/metagenome.combined_otu_table.csv") > 0) + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_appraisal.tsv")) + self.assertTrue(os.path.getsize(f"{output_dir}/aviary_out/diversity/singlem_appraisal.tsv") > 0) + self.assertTrue(os.path.isfile(f"{output_dir}/aviary_out/diversity/singlem_appraise.svg")) def test_short_read_recovery_fast(self): output_dir = os.path.join("example", "test_short_read_recovery_fast") @@ -139,8 +139,8 @@ def test_short_read_recovery_fast(self): f"-o {output_dir}/aviary_out " f"-1 {data}/wgsim.1.fq.gz " f"-2 {data}/wgsim.2.fq.gz " - f"--skip-abundances " - f"--skip-binners concoct rosella vamb maxbin2 metabat " + f"--binning-only " + f"--skip-binners rosella vamb metabat " f"--skip-qc " f"--refinery-max-iterations 0 " f"--conda-prefix {path_to_conda} " diff --git a/test/test_recover.py b/test/test_recover.py index cf21cbd7..d1775d0a 100644 --- a/test/test_recover.py +++ b/test/test_recover.py @@ -40,11 +40,11 @@ def test_recover_simple_inputs(self): self.assertTrue("metabat_ssens" in output) self.assertTrue("metabat_sspec" in output) self.assertTrue("metabat2" in output) - self.assertTrue("maxbin2" in output) + self.assertTrue("maxbin2" not in output) self.assertTrue("rosella" in output) self.assertTrue("semibin" in output) self.assertTrue("vamb" in output) - self.assertTrue("concoct" in output) + self.assertTrue("concoct" not in output) self.assertTrue("das_tool" in output) # Refinery @@ -82,7 +82,7 @@ def test_recover_skip_binners(self): f"-2 {REVERSE_READS} " f"--output {tmpdir}/test " f"--conda-prefix {path_to_conda} " - f"--skip-binners metabat concoct " + f"--skip-binners metabat " f"--dryrun --tmpdir {tmpdir} " f"--snakemake-cmds \" --quiet\" " ) @@ -96,7 +96,7 @@ def test_recover_skip_binners(self): self.assertTrue("metabat_ssens" not in output) self.assertTrue("metabat_sspec" not in output) self.assertTrue("metabat2" not in output) - self.assertTrue("maxbin2" in output) + self.assertTrue("maxbin2" not in output) self.assertTrue("rosella" in output) self.assertTrue("semibin" in output) self.assertTrue("vamb" in output) @@ -133,7 +133,7 @@ def test_recover_no_singlem(self): f"EGGNOG_DATA_DIR=. " f"SINGLEM_METAPACKAGE_PATH=. 
" f"aviary recover " - f"--workflow recover_mags_no_singlem " + f"--skip-singlem " f"--assembly {ASSEMBLY} " f"-1 {FORWARD_READS} " f"-2 {REVERSE_READS} " @@ -152,11 +152,11 @@ def test_recover_no_singlem(self): self.assertTrue("metabat_ssens" in output) self.assertTrue("metabat_sspec" in output) self.assertTrue("metabat2" in output) - self.assertTrue("maxbin2" in output) + self.assertTrue("maxbin2" not in output) self.assertTrue("rosella" in output) self.assertTrue("semibin" in output) self.assertTrue("vamb" in output) - self.assertTrue("concoct" in output) + self.assertTrue("concoct" not in output) self.assertTrue("das_tool" in output) # Refinery @@ -176,7 +176,7 @@ def test_recover_no_singlem(self): self.assertTrue("singlem_pipe_reads" not in output) self.assertTrue("singlem_appraise" not in output) self.assertTrue("finalise_stats" in output) - self.assertTrue("recover_mags_no_singlem" in output) + self.assertTrue("recover_mags" in output) # Unnecessary self.assertTrue("complete_assembly_with_qc" not in output) @@ -208,11 +208,11 @@ def test_recover_no_abundances(self): self.assertTrue("metabat_ssens" in output) self.assertTrue("metabat_sspec" in output) self.assertTrue("metabat2" in output) - self.assertTrue("maxbin2" in output) + self.assertTrue("maxbin2" not in output) self.assertTrue("rosella" in output) self.assertTrue("semibin" in output) self.assertTrue("vamb" in output) - self.assertTrue("concoct" in output) + self.assertTrue("concoct" not in output) self.assertTrue("das_tool" in output) # Refinery @@ -237,6 +237,118 @@ def test_recover_no_abundances(self): # Unnecessary self.assertTrue("complete_assembly_with_qc" not in output) + def test_recover_no_taxonomy(self): + with tempfile.TemporaryDirectory() as tmpdir: + cmd = ( + f"GTDBTK_DATA_PATH=. " + f"CHECKM2DB=. " + f"EGGNOG_DATA_DIR=. " + f"SINGLEM_METAPACKAGE_PATH=. 
" + f"aviary recover " + f"--skip-taxonomy " + f"--assembly {ASSEMBLY} " + f"-1 {FORWARD_READS} " + f"-2 {REVERSE_READS} " + f"--output {tmpdir}/test " + f"--conda-prefix {path_to_conda} " + f"--dryrun --tmpdir {tmpdir} " + f"--snakemake-cmds \" --quiet\" " + ) + output = extern.run(cmd) + + # Binners + self.assertTrue("prepare_binning_files" in output) + self.assertTrue("get_bam_indices" in output) + self.assertTrue("metabat_sens" in output) + self.assertTrue("metabat_spec" in output) + self.assertTrue("metabat_ssens" in output) + self.assertTrue("metabat_sspec" in output) + self.assertTrue("metabat2" in output) + self.assertTrue("maxbin2" not in output) + self.assertTrue("rosella" in output) + self.assertTrue("semibin" in output) + self.assertTrue("vamb" in output) + self.assertTrue("concoct" not in output) + self.assertTrue("das_tool" in output) + + # Refinery + self.assertTrue("checkm_metabat2" in output) + self.assertTrue("refine_metabat2" in output) + self.assertTrue("checkm_rosella" in output) + self.assertTrue("refine_rosella" in output) + self.assertTrue("checkm_semibin" in output) + self.assertTrue("refine_semibin" in output) + self.assertTrue("checkm_das_tool" in output) + self.assertTrue("refine_dastool" in output) + + # Extras + self.assertTrue("checkm2" in output) + self.assertTrue("gtdbtk" not in output) + self.assertTrue("get_abundances" in output) + self.assertTrue("singlem_pipe_reads" in output) + self.assertTrue("singlem_appraise" in output) + self.assertTrue("finalise_stats" in output) + self.assertTrue("recover_mags" in output) + + # Unnecessary + self.assertTrue("complete_assembly_with_qc" not in output) + + def test_recover_binning_only(self): + with tempfile.TemporaryDirectory() as tmpdir: + cmd = ( + f"GTDBTK_DATA_PATH=. " + f"CHECKM2DB=. " + f"EGGNOG_DATA_DIR=. " + f"SINGLEM_METAPACKAGE_PATH=. 
" + f"aviary recover " + f"--binning-only " + f"--assembly {ASSEMBLY} " + f"-1 {FORWARD_READS} " + f"-2 {REVERSE_READS} " + f"--output {tmpdir}/test " + f"--conda-prefix {path_to_conda} " + f"--dryrun --tmpdir {tmpdir} " + f"--snakemake-cmds \" --quiet\" " + ) + output = extern.run(cmd) + + # Binners + self.assertTrue("prepare_binning_files" in output) + self.assertTrue("get_bam_indices" in output) + self.assertTrue("metabat_sens" in output) + self.assertTrue("metabat_spec" in output) + self.assertTrue("metabat_ssens" in output) + self.assertTrue("metabat_sspec" in output) + self.assertTrue("metabat2" in output) + self.assertTrue("maxbin2" not in output) + self.assertTrue("rosella" in output) + self.assertTrue("semibin" in output) + self.assertTrue("vamb" in output) + self.assertTrue("concoct" not in output) + self.assertTrue("das_tool" in output) + + # Refinery + self.assertTrue("checkm_metabat2" in output) + self.assertTrue("refine_metabat2" in output) + self.assertTrue("checkm_rosella" in output) + self.assertTrue("refine_rosella" in output) + self.assertTrue("checkm_semibin" in output) + self.assertTrue("refine_semibin" in output) + self.assertTrue("checkm_das_tool" in output) + self.assertTrue("refine_dastool" in output) + + # Extras + self.assertTrue("checkm2" in output) + self.assertTrue("gtdbtk" not in output) + self.assertTrue("get_abundances" not in output) + self.assertTrue("singlem_pipe_reads" not in output) + self.assertTrue("singlem_appraise" not in output) + self.assertTrue("finalise_stats" in output) + self.assertTrue("recover_mags" in output) + + # Unnecessary + self.assertTrue("complete_assembly_with_qc" not in output) + def test_recover_config(self): with tempfile.TemporaryDirectory() as tmpdir: cmd = (