diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 88e2f14..44b6775 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -10,17 +10,17 @@ on: branches_ignore: [] jobs: - Dry_Run_and_Lint: + Dry_Run_and_Lint_DNA_RNA: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: docker://snakemake/snakemake:latest - - name: Dry Run with test data + - name: Dry Run run: | docker run -v $PWD:/opt2 snakemake/snakemake:latest \ /opt2/metamorph run --input /opt2/.tests/test_cohort.txt \ --output /opt2/output --mode local --dry-run - - name: View the pipeline config file + - name: View the pipeline config file [DNA & RNA modes] run: | echo "Generated config file for pipeline...." && cat $PWD/output/config.json - name: Lint Workflow @@ -28,3 +28,31 @@ jobs: run: | docker run -v $PWD:/opt2 snakemake/snakemake:latest snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \ echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.' + Dry_Run_and_Lint_DNAonly1: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: docker://snakemake/snakemake:latest + - name: Dry Run [DNA only mode] + run: | + docker run -v $PWD:/opt2 snakemake/snakemake:latest \ + /opt2/metamorph run --input /opt2/.tests/test_cohort_DNAonly.txt \ + --output /opt2/output --mode local --dry-run + - name: View the pipeline config file [DNA only mode] + run: | + echo "Generated config file for pipeline...." && cat $PWD/output/config.json + - name: Lint Workflow + continue-on-error: true + run: | + docker run -v $PWD:/opt2 snakemake/snakemake:latest snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \ + echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.' 
+ Dry_Run_DNA2: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: docker://snakemake/snakemake:latest + - name: Dry Run [DNA only mode w RNA column] + run: | + docker run -v $PWD:/opt2 snakemake/snakemake:latest \ + /opt2/metamorph run --input /opt2/.tests/test_cohort_DNAonly_RNAcolexists.txt \ + --output /opt2/output --mode local --dry-run diff --git a/.tests/test_cohort_DNAonly.txt b/.tests/test_cohort_DNAonly.txt new file mode 100644 index 0000000..a40fe5f --- /dev/null +++ b/.tests/test_cohort_DNAonly.txt @@ -0,0 +1,5 @@ +DNA +/opt2/.tests/WT_S1_R1.fastq.gz +/opt2/.tests/WT_S1_R2.fastq.gz +/opt2/.tests/WT_S2_R1.fastq.gz +/opt2/.tests/WT_S2_R2.fastq.gz diff --git a/.tests/test_cohort_DNAonly_RNAcolexists.txt b/.tests/test_cohort_DNAonly_RNAcolexists.txt new file mode 100644 index 0000000..e4a4d30 --- /dev/null +++ b/.tests/test_cohort_DNAonly_RNAcolexists.txt @@ -0,0 +1,5 @@ +DNA RNA +/opt2/.tests/WT_S1_R1.fastq.gz +/opt2/.tests/WT_S1_R2.fastq.gz +/opt2/.tests/WT_S2_R1.fastq.gz +/opt2/.tests/WT_S2_R2.fastq.gz diff --git a/config/cluster.json b/config/cluster.json index c472956..ce849f6 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -21,8 +21,8 @@ "gres": "lscratch:400" }, "metawrap_genome_assembly": { - "threads": 128, - "mem": "128G", + "threads": 56, + "mem": "240G", "partition": "norm", "time": "10-00:00:00" }, @@ -36,7 +36,8 @@ "threads": 32, "mem": "64G", "partition": "norm", - "time": "2-00:00:00" + "time": "2-00:00:00", + "gres": "lscratch:600" }, "derep_bins": { "threads": 32, @@ -52,8 +53,8 @@ "time": "2-00:00:00" }, "bbtools_index_map": { - "threads": 48, - "mem": "64G", + "threads": 28, + "mem": "220G", "partition": "norm", "time": "2-00:00:00" }, @@ -92,10 +93,10 @@ }, "rna_humann_classify": { "threads": 32, - "mem": "64G", + "mem": "128G", "partition": "norm", "time": "10-00:00:00", - "gres": "lscratch:600" + "gres": "lscratch:800" }, "map_to_rna_to_mag": { "threads": 32, diff --git a/config/images.json 
b/config/images.json index 90c652c..fa6d9d4 100644 --- a/config/images.json +++ b/config/images.json @@ -1,8 +1,8 @@ { "images": { - "metawrap": "docker://rroutsong/metamorph_metawrap:0.0.9" + "metawrap": "docker://rroutsong/metamorph_metawrap:0.0.11" }, "containers": { - "metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_0.0.9.sif" + "metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_0.0.11.sif" } } diff --git a/config/resources.json b/config/resources.json index 95cfa5e..eb9fb30 100644 --- a/config/resources.json +++ b/config/resources.json @@ -53,37 +53,37 @@ }, { "name": "CAT_db", "to": "/data2/CAT", - "from": "/vf/users/OpenOmics/references/metamorph/CAT_pack_prepare/cat_db/db", + "from": "/data/OpenOmics/references/metamorph/CAT_pack_prepare/db", "mode": "rw" }, { "name": "CAT_tax", "to": "/data2/CAT_tax", - "from": "/vf/users/OpenOmics/references/metamorph/CAT_pack_prepare/cat_db/db", + "from": "/data/OpenOmics/references/metamorph/CAT_pack_prepare/tax", "mode": "rw" }, { "name": "humann3_protein_db", "to": "/data2/uniref", - "from": "/vf/users/OpenOmics/references/metamorph/uniref", + "from": "/data/OpenOmics/references/metamorph/uniref", "mode": "rw" }, { "name": "humann3_nuc_db", "to": "/data2/chocophlan", - "from": "/vf/users/OpenOmics/references/metamorph/chocophlan", + "from": "/data/OpenOmics/references/metamorph/chocophlan", "mode": "rw" }, { "name": "humann3_config", "to": "/opt/conda/envs/bb3/lib/python3.7/site-packages/humann", - "from": "/vf/users/OpenOmics/references/metamorph/humann", + "from": "/data/OpenOmics/references/metamorph/humann", "mode": "rw" }, { "name": "utility_mapping", "to": "/data2/um", - "from": "/vf/users/OpenOmics/references/metamorph/utility_mapping", + "from": "/data/OpenOmics/references/metamorph/utility_mapping", "mode": "rw" }, { "name": "metaphlan_db", "to": "/data2/metaphlan", - "from": "/vf/users/OpenOmics/references/metamorph/metaphlan", + "from": "/data/OpenOmics/references/metamorph/metaphlan", "mode": "rw" } 
] diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile index 0588984..2acd134 100644 --- a/docker/metawrap/Dockerfile +++ b/docker/metawrap/Dockerfile @@ -18,7 +18,7 @@ RUN mamba install -y -n bb3 -c biobakery humann RUN mkdir /install; cd /install; wget https://carlowood.github.io/which/which-2.21.tar.gz; tar xvf which-2.21.tar.gz RUN cd /install/which-2.21; ./configure; make && make install RUN rm /usr/bin/which; ln -s /usr/local/bin/which /usr/bin/which -RUN cd /home; git clone https://github.com/dutilh/CAT.git; chmod +x CAT_pack/CAT_pack; ln -s /home/CAT/CAT_pack/CAT_pack /usr/bin/CAT; chmod +x /usr/bin/CAT +RUN cd /home; git clone https://github.com/rroutsong/CAT_pack.git; ln -s /home/CAT_pack/CAT_pack/CAT_pack /usr/bin/CAT RUN cd /install; wget https://github.com/hyattpd/Prodigal/archive/refs/tags/v2.6.3.tar.gz; tar xvf v2.6.3.tar.gz RUN cd /install/Prodigal-2.6.3; make install INSTALLDIR=/usr/bin RUN cd /install; wget https://github.com/bbuchfink/diamond/archive/refs/tags/v2.1.9.tar.gz; tar xvf v2.1.9.tar.gz diff --git a/metamorph b/metamorph index 1ee853f..cf91ed3 100755 --- a/metamorph +++ b/metamorph @@ -57,7 +57,8 @@ from src.utils import ( exists, fatal, check_cache, - require + require, + permissions ) @@ -114,10 +115,12 @@ def run(sub_args): # Step 1. Parse input sample sheet in down stream # data structures consistent with argparse parser - validated, sample_map = valid_input(sub_args.input) + validated, rna_included = valid_input(sub_args.input) + delattr(sub_args, 'input') setattr(sub_args, 'input', [row['DNA'] for row in validated]) - setattr(sub_args, 'rna', [row['RNA'] for row in validated]) + if rna_included: + setattr(sub_args, 'rna', [row['RNA'] for row in validated]) # Step 2. 
Initialize working directory, # copy over required resources to run @@ -125,8 +128,8 @@ def run(sub_args): git_repo = __home__ fastq_inputs = [sub_args.input] - - if sub_args.rna: + + if getattr(sub_args, 'rna', False): fastq_inputs.append(sub_args.rna) input_files = init( @@ -142,7 +145,8 @@ def run(sub_args): config = setup(sub_args, ifiles = input_files, repo_path = git_repo, - output_path = sub_args.output + output_path = sub_args.output, + rna=rna_included ) # Step 4. Resolve docker/singularity bind diff --git a/src/run.py b/src/run.py index baf6718..36c54df 100644 --- a/src/run.py +++ b/src/run.py @@ -67,7 +67,7 @@ def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'c try: os.mkdir(os.path.join(output_path, 'rna')) except FileExistsError: - pass + pass inputs['rna'] = sym_safe(input_data = links[1], target = os.path.join(output_path, 'rna')) return inputs @@ -112,7 +112,7 @@ def sym_safe(input_data, target): if not exists(renamed): # Create a symlink if it does not already exist # Follow source symlinks to resolve any binding issues - os.symlink(os.path.abspath(os.path.realpath(file)), renamed) + os.symlink(os.path.abspath(file), renamed) return input_fastqs @@ -182,7 +182,7 @@ def get_sid(filepath): return -def setup(sub_args, ifiles, repo_path, output_path): +def setup(sub_args, ifiles, repo_path, output_path, rna=True): """Setup the pipeline for execution and creates config file from templates @param sub_args : Parsed arguments for run sub-command @@ -238,13 +238,16 @@ def setup(sub_args, ifiles, repo_path, output_path): config['options'][opt] = v # RNA -> DNA mapping - sample_map = {} - dna_files, rna_files = ifiles['dna'], ifiles['rna'] - for i in range(len(dna_files)): - r_sid = get_sid(rna_files[i]) - d_sid = get_sid(dna_files[i]) - sample_map[r_sid] = d_sid - config['sample_map'] = sample_map + if rna: + sample_map = {} + dna_files, rna_files = ifiles['dna'], ifiles['rna'] + for i in range(len(dna_files)): + r_sid = 
get_sid(rna_files[i]) + d_sid = get_sid(dna_files[i]) + sample_map[r_sid] = d_sid + config['sample_map'] = sample_map + else: + config['rna'] = False return config @@ -339,7 +342,7 @@ def bind(sub_args, config): if 'databases' in config: dbs = config.pop('databases') - bindpaths.extend([resolve(mount['from'])+':'+resolve(mount['to'])+':'+mount['mode'] for mount in dbs]) + bindpaths.extend([resolve(mount['from'])+':'+mount['to']+':'+mount['mode'] for mount in dbs]) if 'options' in config and 'input' in config['options']: inrents = list(set([os.path.abspath(os.path.dirname(p)) for p in config['options']['input'] if os.path.exists(os.path.dirname(p)) and os.path.isdir(os.path.dirname(p))])) @@ -351,7 +354,11 @@ def bind(sub_args, config): if 'options' in config and 'rna' in config['options']: rnarents = list(set([os.path.abspath(os.path.dirname(p)) for p in config['options']['rna'] if os.path.exists(os.path.dirname(p)) and os.path.isdir(os.path.dirname(p))])) - bindpaths.extend(rnarents) + common_parent = longest_common_parent_path(rnarents) + if common_parent: + bindpaths.extend([common_parent]) + else: + bindpaths.extend(rnarents) if 'options' in config and 'output' in config['options']: if os.path.exists(config['options']['output']) and os.path.isdir(config['options']['output']): @@ -360,8 +367,8 @@ def bind(sub_args, config): if 'tmp_dir' in config: bindpaths.append(config['tmp_dir']) - rawdata_bind_paths = [os.path.abspath(p) for p in config['project']['datapath'].split(',')] - working_directory = os.path.realpath(config['project']['workpath']) + # rawdata_bind_paths = [os.path.abspath(p) for p in config['project']['datapath'].split(',')] + # working_directory = os.path.realpath(config['project']['workpath']) return list(set(bindpaths)) @@ -486,7 +493,7 @@ def add_rawdata_information(sub_args, config, ifiles): # Finds the set of rawdata directories to bind config['project']['datapath'] = ','.join(get_rawdata_bind_paths(input_files = sub_args.input)) - if 
sub_args.rna: + if getattr(sub_args, 'rna', False): config["project"]["rna_datapath"] = ','.join(get_rawdata_bind_paths(input_files = sub_args.rna)) # Add each sample's basename @@ -613,7 +620,7 @@ def get_rawdata_bind_paths(input_files): bindpaths = [] for file in input_files: # Get directory of input file - rawdata_src_path = os.path.dirname(os.path.abspath(os.path.realpath(file))) + rawdata_src_path = os.path.dirname(os.path.abspath(file)) if rawdata_src_path not in bindpaths: bindpaths.append(rawdata_src_path) @@ -675,26 +682,33 @@ def valid_input(sheet): raise argparse.ArgumentTypeError(f"Path `{sheet}` exists, but cannot read path due to permissions!") # check format to make sure it's correct - sheet = open(sheet, 'r') - dialect = Sniffer().sniff(sheet.read(), [',', "\t"]) - sheet.seek(0) - rdr = DictReader(sheet, delimiter=dialect.delimiter) + if sheet.endswith('.tsv') or sheet.endswith('.txt'): + delim = '\t' + elif sheet.endswith('.csv'): + delim = ',' + + rdr = DictReader(open(sheet, 'r'), delimiter=delim) + if 'DNA' not in rdr.fieldnames: raise argparse.ArgumentTypeError("Sample sheet does not contain `DNA` column") if 'RNA' not in rdr.fieldnames: - raise argparse.ArgumentTypeError("Sample sheet does not contain `RNA` column") - data = [row for row in rdr] + print("-- Running in DNA only mode --") + else: + print("-- Running in paired DNA & RNA mode --") - this_map = {} + data = [row for row in rdr] + RNA_included = False for row in data: - row['RNA'] = os.path.abspath(row['RNA']) row['DNA'] = os.path.abspath(row['DNA']) - if not os.path.exists(row['RNA']): - raise argparse.ArgumentTypeError(f"Sample sheet path `{row['RNA']}` does not exist") if not os.path.exists(row['DNA']): raise argparse.ArgumentTypeError(f"Sample sheet path `{row['DNA']}` does not exist") - - return data, this_map + if 'RNA' in row and not row['RNA'] in ('', None, 'None'): + RNA_included = True + row['RNA'] = os.path.abspath(row['RNA']) + if not os.path.exists(row['RNA']): + raise 
argparse.ArgumentTypeError(f"Sample sheet path `{row['RNA']}` does not exist") + + return data, RNA_included try: diff --git a/src/run.sh b/src/run.sh index 0a25b19..079e4be 100755 --- a/src/run.sh +++ b/src/run.sh @@ -240,7 +240,7 @@ snakemake \\ --jobs 500 \\ --keep-remote \\ --stats "$3/logfiles/runtime_statistics.json" \\ - --restart-times 0 \\ + --restart-times 1 \\ --keep-incomplete \\ --local-cores "14" 2>&1 # Create summary report diff --git a/src/utils.py b/src/utils.py index 5ae4120..37af5e1 100644 --- a/src/utils.py +++ b/src/utils.py @@ -46,11 +46,9 @@ def longest_common_parent_path(data): for val in data[1:]: s.intersection_update(substrs(val)) longest = max(s, key=len) - if longest.count('/') <= 3: + if longest.count('/') < 3: return None - if longest[-1] != '/': - longest = '/'.join(longest.split('/')[0:-1]) - return longest + return os.path.dirname(longest) def permissions(parser, path, *args, **kwargs): @@ -121,7 +119,7 @@ def ln(files, outdir): for file in files: ln = os.path.join(outdir, os.path.basename(file)) if not exists(ln): - os.symlink(os.path.abspath(os.path.realpath(file)), ln) + os.symlink(os.path.abspath(file), ln) def which(cmd, path=None): diff --git a/workflow/Snakefile b/workflow/Snakefile index 5c1b677..5a175c8 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -92,6 +92,16 @@ rule all: expand(join(top_refine_dir, "{name}", "dRep", "data_tables", "Widb.csv"), name=samples), expand(join(top_refine_dir, "{name}", "dRep", "figures", "Winning_genomes.pdf"), name=samples), expand(join(top_refine_dir, "{name}", "dRep", "log", "cluster_arguments.json"), name=samples), + # bin statistics + # ~~~~~~~~~~~~~~~ + expand(join(top_refine_dir, "{name}", "RefinedBins_summmary.txt"), name=samples), + expand(join(top_refine_dir, "{name}", "named_maxbin2_bins.stats"), name=samples), + expand(join(top_refine_dir, "{name}", "named_metabat2_bins.stats"), name=samples), + expand(join(top_refine_dir, "{name}", "named_metawrap_bins.stats"), 
name=samples), + join(top_refine_dir, "RefinedBins_summmary.txt"), + join(top_refine_dir, "cumulative_stats_maxbin.txt"), + join(top_refine_dir, "cumulative_stats_metabat2.txt"), + join(top_refine_dir, "cumulative_stats_metawrap.txt"), # contig annotation # ~~~~~~~~~~~~~~~ expand(join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.summary.txt"), name=samples), diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk index 9cc5eca..2620e13 100644 --- a/workflow/rules/DNA.smk +++ b/workflow/rules/DNA.smk @@ -318,7 +318,7 @@ rule metawrap_binning: Orchestrates execution of this ensemble of metagenomic binning software: - MaxBin2 - - metaBAT2 + - metaBAT2 - CONCOCT Stepwise algorithm for metawrap genome binning: @@ -370,12 +370,21 @@ rule metawrap_binning: bin_mem = mem2int(cluster['metawrap_binning'].get("mem", default_memory)), mw_trim_linker_R1 = join(top_trim_dir, "{name}", "{name}_1.fastq"), mw_trim_linker_R2 = join(top_trim_dir, "{name}", "{name}_2.fastq"), + tmp_bin_dir = join(config['options']['tmp_dir'], 'bin'), + tmp_binref_dir = join(config['options']['tmp_dir'], 'bin_rf'), min_perc_complete = "50", max_perc_contam = "5", singularity: metawrap_container, threads: int(cluster["metawrap_binning"].get("threads", default_threads)), shell: """ + # safe temp directory + if [ ! -d "{params.tmp_bin_dir}" ]; then mkdir -p "{params.tmp_bin_dir}"; fi + if [ ! 
-d "{params.tmp_binref_dir}" ]; then mkdir -p "{params.tmp_binref_dir}"; fi + tmp_bin=$(mktemp -d -p "{params.tmp_bin_dir}") + tmp_bin_ref=$(mktemp -d -p "{params.tmp_binref_dir}") + trap 'rm -rf "{params.tmp_bin_dir}" "{params.tmp_binref_dir}"' EXIT + # set checkm data checkm data setRoot /data2/CHECKM_DB export CHECKM_DATA_PATH="/data2/CHECKM_DB" @@ -399,14 +408,17 @@ rule metawrap_binning: # metawrap bin refinement mw bin_refinement \ - -o {params.bin_dir} \ + -o {params.tmp_binref_dir} \ + -m {params.bin_mem} \ -t {threads} \ -A {params.bin_dir}/metabat2_bins \ -B {params.bin_dir}/maxbin2_bins \ -c {params.min_perc_complete} \ -x {params.max_perc_contam} - + + cp -r {params.tmp_binref_dir}/* {params.bin_dir} touch {output.bin_breadcrumb} + chmod -R 775 {params.bin_dir} """ @@ -424,25 +436,24 @@ rule bin_stats: named_stats_metabat2 = join(top_refine_dir, "{name}", "named_metabat2_bins.stats"), named_stats_metawrap = join(top_refine_dir, "{name}", "named_metawrap_bins.stats"), params: + rname = "bin_stats", sid = "{name}", this_bin_dir = join(top_refine_dir, "{name}"), # count number of fasta files in the bin folders to get count of bins - metabat2_num_bins = lambda wc, _out, _in: str(len([fn for fn in os.listdir(_in.metabat2_bins) if "unbinned" not in fn.lower()])), - maxbin_num_bins = lambda wc, _out, _in: str(len([fn for fn in os.listdir(_in.maxbin_bins) if "unbinned" not in fn.lower()])), - metawrap_num_bins = lambda wc, _out, _in: str(len([fn for fn in os.listdir(_in.metawrap_bins) if "unbinned" not in fn.lower()])), + metabat2_num_bins = lambda wc, input: str(len([fn for fn in os.listdir(input.metabat2_bins) if "unbinned" not in fn.lower()])), + maxbin_num_bins = lambda wc, input: str(len([fn for fn in os.listdir(input.maxbin_bins) if "unbinned" not in fn.lower()])), + metawrap_num_bins = lambda wc, input: str(len([fn for fn in os.listdir(input.metawrap_bins) if "unbinned" not in fn.lower()])), shell: """ - # count cumulative lines starting with `>` 
metabat2_contigs=$(cat {input.metabat2_bins}/*fa | grep -c "^>") maxbin_configs=$(cat {input.maxbin_bins}/*fa | grep -c "^>") metawrap_contigs=$(cat {input.metawrap_bins}/*fa | grep -c "^>") echo "SampleID\tmetabat2_bins\tmaxbin2_bins\tmetaWRAP_50_5_bins\tmetabat2_contigs\tmaxbin2_contigs\tmetaWRAP_50_5_contigs" > {output.this_refine_summary} - echo "{params.sid}\t{params.metabat2_num_bins}\t{params.maxbin_num_bins}\t{params.metawrap_num_bins}\t$metabat2_contigs\t$maxbin_configs\t$metawrap_contigs" - + echo "{params.sid}\t{params.metabat2_num_bins}\t{params.maxbin_num_bins}\t{params.metawrap_num_bins}\t$metabat2_contigs\t$maxbin_configs\t$metawrap_contigs" >> {output.this_refine_summary} # name contigs with SID - cat {input.maxbin_stats} | sed 's/^bin./{name}_bin./g' > {output.named_stats_maxbin2} - cat {input.metabat2_stats} | sed 's/^bin./{name}_bin./g' > {output.named_stats_metabat2} - cat {input.metawrap_stats} | sed 's/^bin./{name}_bin./g' > {output.named_stats_metawrap} + cat {input.maxbin_stats} | sed 's/^bin./{params.sid}_bin./g' > {output.named_stats_maxbin2} + cat {input.metabat2_stats} | sed 's/^bin./{params.sid}_bin./g' > {output.named_stats_metabat2} + cat {input.metawrap_stats} | sed 's/^bin./{params.sid}_bin./g' > {output.named_stats_metawrap} """ @@ -458,24 +469,24 @@ rule cumulative_bin_stats: cumulative_metabat2_stats = join(top_refine_dir, "cumulative_stats_metabat2.txt"), cumulative_metawrap_stats = join(top_refine_dir, "cumulative_stats_metawrap.txt"), params: - bin_dir = top_binning_dir + rname = "cumulative_bin_stats", + refine_dir = top_refine_dir shell: """ # generate cumulative binning summary echo "SampleID\tmetabat2_bins\tmaxbin2_bins\tmetaWRAP_50_5_bins\tmetabat2_contigs\tmaxbin2_contigs\tmetaWRAP_50_5_contigs" > {output.cumulative_bin_summary} - for report in {params.bin_dir}/*/RefinedBins_summmary.txt; do + for report in `ls {params.refine_dir}/*/RefinedBins_summmary.txt`; do tail -n+2 $report >> {output.cumulative_bin_summary} - echo 
"tail -n+2 $report >> {output.cumulative_bin_summary}" done # generate cumulative statistic report for binning - for report in {params.bin_dir}/*/named_maxbin2_bins.stats; do + for report in `ls {params.refine_dir}/*/named_maxbin2_bins.stats`; do cat $report >> {output.cumulative_maxbin_stats} done - for report in {params.bin_dir}/*/named_maxbin2_bins.stats; do + for report in `ls {params.refine_dir}/*/named_maxbin2_bins.stats`; do cat $report >> {output.cumulative_metabat2_stats} done - for report in {params.bin_dir}/*/named_maxbin2_bins.stats; do + for report in `ls {params.refine_dir}/*/named_maxbin2_bins.stats`; do cat $report >> {output.cumulative_metawrap_stats} done @@ -541,6 +552,11 @@ rule derep_bins: # run drep export DREP_BINS=$(ls {params.metawrap_bins}/* | tr '\\n' ' ') + + printf "\ngenome list\n" + printf ${{DREP_BINS}} + printf "\n" + mkdir -p {params.derep_dir} dRep dereplicate -d \ -g ${{DREP_BINS}} \ @@ -562,15 +578,15 @@ rule contig_annotation: dRep_dir = join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes"), output: cat_bin2cls_filename = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.txt"), - cat_bing2cls_official = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.official_names.txt"), - cat_bing2cls_summary = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.summary.txt"), + cat_bin2cls_official = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.official_names.txt"), + cat_bin2cls_summary = join(top_refine_dir, "{name}", "contig_annotation", "out.BAT.bin2classification.summary.txt"), params: cat_dir = join(top_refine_dir, "{name}", "contig_annotation"), dRep_dir = join(top_refine_dir, "{name}", "dRep", "dereplicated_genomes"), rname = "contig_annotation", sid = "{name}", cat_db = "/data2/CAT", # from /config/resources.json - tax_db = "/data2/NCBI_TAX_DB", # from /config/resourcs.json + tax_db = 
"/data2/CAT_tax", # from /config/resourcs.json diamond_exec = "/usr/bin/diamond", threads: int(cluster["contig_annotation"].get("threads", default_threads)), singularity: metawrap_container, @@ -583,16 +599,17 @@ rule contig_annotation: -t {params.tax_db} \ --path_to_diamond {params.diamond_exec} \ --bin_suffix .fa \ + -f 0.5 \ -n {threads} \ --force cd {params.cat_dir} && CAT add_names \ -i {output.cat_bin2cls_filename} \ - -o {output.cat_bing2cls_official} \ + -o {output.cat_bin2cls_official} \ -t {params.tax_db} \ --only_official cd {params.cat_dir} && CAT summarise \ - -i {output.cat_bing2cls_official} \ - -o {output.cat_bing2cls_summary} + -i {output.cat_bin2cls_official} \ + -o {output.cat_bin2cls_summary} """ @@ -686,7 +703,7 @@ rule bbtools_index_map: # """ -rule gtdbk_classify: +rule gtdbtk_classify: input: R1 = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"), R2 = join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq"), diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk index fff9210..dcdfbb0 100644 --- a/workflow/rules/RNA.smk +++ b/workflow/rules/RNA.smk @@ -25,8 +25,8 @@ top_map_dir_rna = join(workpath, config['project']['id'], "m rule concat_rna_reads: input: - all_r1_reads = expand(join(workpath, "rna", "{rname}_R1.fastq.gz"), rname=rna_sample_stems if rna_coasm else []), - all_r2_reads = expand(join(workpath, "rna", "{rname}_R2.fastq.gz"), rname=rna_sample_stems if rna_coasm else []), + all_r1_reads = expand(join(workpath, "rna", "{rname}_R1.fastq.gz"), rname=rna_sample_stems if rna_coasm else []) if rna_included else [], + all_r2_reads = expand(join(workpath, "rna", "{rname}_R2.fastq.gz"), rname=rna_sample_stems if rna_coasm else []) if rna_included else [], output: big_compressed_read_r1 = join(workpath, "rna", "concatenated_R1.fastq.gz"), big_compressed_read_r2 = join(workpath, "rna", "concatenated_R2.fastq.gz"), @@ -69,8 +69,8 @@ rule concat_rna_reads: rule rna_read_qc: input: - R1 = join(workpath, "rna", 
"{rname}_R1.fastq.gz"), - R2 = join(workpath, "rna", "{rname}_R2.fastq.gz"), + R1 = join(workpath, "rna", "{rname}_R1.fastq.gz") if rna_included else [], + R2 = join(workpath, "rna", "{rname}_R2.fastq.gz") if rna_included else [], output: R1_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R1_pretrim_report.html"), R2_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R2_pretrim_report.html"), @@ -162,23 +162,23 @@ rule rna_humann_classify: trap 'rm -rf "{params.tmp_safe_dir}"' EXIT # human configuration - humann_config --update database_folders nucleotide {params.chocophlan_db} - humann_config --update database_folders protein {params.uniref_db} - humann_config --update database_folders utility_mapping {params.util_map_db} humann_config --print > {output.humann_config} # metaphlan configuration export DEFAULT_DB_FOLDER={params.metaphlan_db} - metaphlan --install --bowtie2db {params.metaphlan_db} # mapping execution + + # old method signature + # --metaphlan-options "--bowtie2db {params.metaphlan_db} --nproc {threads}" + cat {input.R1} {input.R2} > {params.tmpread} humann \ --threads {threads} \ --input {params.tmpread} \ --remove-temp-output \ --input-format fastq.gz \ - --metaphlan-options "--bowtie2db {params.metaphlan_db} --nproc {threads}" \ + --metaphlan-options "--bowtie2db {params.metaphlan_db}" \ --output-basename {params.sid} \ --log-level DEBUG \ --o-log {output.humann_log} \ @@ -188,9 +188,9 @@ rule rna_humann_classify: rule map_to_rna_to_mag: input: - dna_input = lambda wc: get_dna(wc.rname), - R1 = join(top_trim_dir_rna, "{rname}", "{rname}_R1_trimmed.fastq.gz"), - R2 = join(top_trim_dir_rna, "{rname}", "{rname}_R2_trimmed.fastq.gz"), + dna_input = lambda wc: get_dna(wc.rname) if rna_included else [], + R1 = join(top_trim_dir_rna, "{rname}", "{rname}_R1_trimmed.fastq.gz") if rna_included else [], + R2 = join(top_trim_dir_rna, "{rname}", "{rname}_R2_trimmed.fastq.gz") if rna_included else [], output: aligned_rna = 
join(top_map_dir_rna, "{rname}", "{rname}.RNA.aligned.sam"), statsfile = join(top_map_dir_rna, "{rname}", "{rname}.RNA.statsfile"), @@ -203,8 +203,8 @@ rule map_to_rna_to_mag: sid = "{rname}", map_rna_dir = join(top_map_dir_rna, "{rname}"), minid = "0.90", - mag_dir = lambda wc, input: input.dna_input[0], - mag_idx = lambda wc, input: input.dna_input[1], + mag_dir = lambda wc, input: input.dna_input[0] if input.dna_input else [], + mag_idx = lambda wc, input: input.dna_input[1] if input.dna_input else [], threads: int(cluster["map_to_rna_to_mag"].get('threads', default_threads)), containerized: metawrap_container, shell: diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 616ae68..e0d355a 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -6,10 +6,13 @@ def get_paired_dna(config, rna_sid): + if 'sample_map' not in config: + return None + if not list_bool(config.get("rna", 'false')): + return None sample_map = config['sample_map'] dna_sid = sample_map[rna_sid] - if not list_bool(config.get("rna", 'false')): - return [] + # dna directories top_refine_dir = join(config["project"]["workpath"], config['project']['id'], "metawrap_bin_refine") top_mags_dir = join(config["project"]["workpath"], config['project']['id'], "mags")