main.nf

#!/usr/bin/env nextflow
/*
========================================================================================
                         nf-core/mag
========================================================================================

nf-core/mag Analysis Pipeline. Started 2018-05-22.
#### Homepage / Documentation
https://github.com/nf-core/mag
#### Authors
Hadrien Gourlé - HadrienG <hadrien.gourle@slu.se> - hadriengourle.com
Daniel Straub <d4straub@gmail.com>
Sabrina Krakau <sabrinakrakau@gmail.com>
----------------------------------------------------------------------------------------
*/

/*
 * Print the pipeline banner followed by the command line usage text.
 * The text is purely informational; it is written through log.info and
 * nothing is returned to the caller.
 */
def helpMessage() {
    log.info nfcoreHeader()
    log.info"""
    Usage:

    The typical command for running the pipeline is as follows:

    nextflow run nf-core/mag --input '*_R{1,2}.fastq.gz' -profile docker
    nextflow run nf-core/mag --manifest 'manifest.tsv' -profile docker

    Mandatory arguments:
      --input [file]                  Path to input data (must be surrounded with quotes)
      -profile [str]                  Configuration profile to use. Can use multiple (comma separated)
                                      Available: conda, docker, singularity, test, awsbatch, <institute> and more

    Hybrid assembly:
      --manifest [file]                     Path to manifest file (must be surrounded with quotes), required for hybrid assembly with metaSPAdes
                                            Has 4 headerless columns (tab separated): Sample_Id, Long_Reads, Short_Reads_1, Short_Reads_2
                                            Only one file path per entry allowed

    Options:
      --genome [str]                        Name of iGenomes reference
      --single_end [bool]                   Specifies that the input is single-end reads

    Other options:
      --outdir [file]                       The output directory where the results will be saved
      --publish_dir_mode [str]              Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy)
      --email [email]                       Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
      --email_on_fail [email]               Same as --email, except only send mail if the workflow is not successful
      --max_multiqc_email_size [str]        Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
      -name [str]                           Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic

    Short read preprocessing:
      --adapter_forward [str]               Sequence of 3' adapter to remove in the forward reads
      --adapter_reverse [str]               Sequence of 3' adapter to remove in the reverse reads
      --mean_quality [int]                  Mean qualified quality value for keeping read (default: 15)
      --trimming_quality [int]              Trimming quality value for the sliding window (default: 15)
      --host_genome [str]                   Name of iGenomes reference for host contamination removal (mutually exclusive with --host_fasta)
      --host_fasta [file]                   Fasta reference file for host contamination removal (mutually exclusive with --host_genome). Potentially masked.
      --host_removal_verysensitive [bool]   Use --very-sensitive setting (instead of --sensitive) for Bowtie 2 to map reads against host genome (default: false)
      --host_removal_save_ids [bool]        Save read ids of removed host reads (default: false)
      --keep_phix [bool]                    Keep reads similar to the Illumina internal standard PhiX genome (default: false)

    Long read preprocessing:
      --skip_adapter_trimming [bool]        Skip removing adapter sequences from long reads
      --longreads_min_length [int]          Discard any read which is shorter than this value (default: 1000)
      --longreads_keep_percent [float]      Keep this percent of bases (default: 90)
      --longreads_length_weight [float]     The higher the more important is read length when choosing the best reads (default: 10)
      --keep_lambda [bool]                  Keep reads similar to the ONT internal standard Escherichia virus Lambda genome (default: false)

    Assembly:
      --skip_spades [bool]                  Skip Illumina-only SPAdes assembly
      --skip_spadeshybrid [bool]            Skip SPAdes hybrid assembly (only available when using manifest input)
      --skip_megahit [bool]                 Skip MEGAHIT assembly
      --skip_quast [bool]                   Skip metaQUAST

    Taxonomy:
      --centrifuge_db [file]                Database for taxonomic binning with centrifuge (default: none). E.g. "ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz"
      --kraken2_db [file]                   Database for taxonomic binning with kraken2 (default: none). E.g. "ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz"
      --skip_krona [bool]                   Skip creating a krona plot for taxonomic binning
      --cat_db [file]                       Database for taxonomic classification of metagenome assembled genomes (default: none). E.g. "http://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20200618.tar.gz"
                                            The zipped file needs to contain a folder named "*taxonomy*" and "*CAT_database*" that hold the respective files.

    Binning options:
      --skip_binning [bool]                 Skip metagenome binning
      --min_contig_size [int]               Minimum contig size to be considered for binning and for bin quality check (default: 1500)
      --min_length_unbinned_contigs [int]   Minimal length of contigs that are not part of any bin but treated as individual genome (default: 1000000)
      --max_unbinned_contigs [int]          Maximal number of contigs that are not part of any bin but treated as individual genome (default: 100)

    Bin quality check:
      --skip_busco [bool]                   Disable bin QC with BUSCO (default: false)
      --busco_reference [file]              Download path for BUSCO database, available databases are listed here: https://busco.ezlab.org/
                                            (default: https://busco-data.ezlab.org/v4/data/lineages/bacteria_odb10.2020-03-06.tar.gz)
      --save_busco_reference [bool]         Save BUSCO reference. Useful to allow reproducibility, as BUSCO datasets are frequently updated and old versions do not always remain accessible.

    Reproducibility options:
      --megahit_fix_cpu_1 [bool]            Fix number of CPUs for MEGAHIT to 1. Not increased with retries (default: false)
      --spades_fix_cpus [int]               Fixed number of CPUs used by SPAdes. Not increased with retries (default: none)
      --spadeshybrid_fix_cpus [int]         Fixed number of CPUs used by SPAdes hybrid. Not increased with retries (default: none)
      --metabat_rng_seed [int]              RNG seed for MetaBAT2. Use positive integer to ensure reproducibility (default: 1). Set to 0 to use random seed.

    AWSBatch options:
      --awsqueue [str]                      The AWSBatch JobQueue that needs to be set when running on AWSBatch
      --awsregion [str]                     The AWS Region for your AWS Batch job to run on
      --awscli [str]                        Path to the AWS CLI tool
    """.stripIndent()
}

// When --help is given: print the usage text and terminate with a success status
if (params.help) { helpMessage(); exit 0 }

/*
 * SET UP CONFIGURATION VARIABLES
 */

// Pick the run name to report.
// If workflow.runName matches the two-lowercase-words pattern it is treated as
// not user-specified and --name (params.name, possibly unset) is used instead;
// any other run name was set explicitly via -name and takes precedence.
custom_runName = (workflow.runName ==~ /[a-z]+_[a-z]+/) ? params.name : workflow.runName

// Sanity checks that only apply when the 'awsbatch' profile is active
if (workflow.profile.contains('awsbatch')) {
    // Queue and region are both mandatory on AWS Batch
    def aws_settings_complete = params.awsqueue && params.awsregion
    if (!aws_settings_complete) {
        exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
    }
    // Results have to be written to an S3 bucket
    // related: https://github.com/nextflow-io/nextflow/issues/813
    if (!params.outdir.startsWith('s3:')) {
        exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
    }
    // Trace files must stay local because S3 does not support rolling files
    if (params.tracedir.startsWith('s3:')) {
        exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
    }
}

// Stage the bundled MultiQC config and the output documentation;
// all of these must exist, otherwise the run aborts right away
ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true)
ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true)
ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true)

// A user-supplied MultiQC config is optional; without it the channel stays empty
if (params.multiqc_config) {
    ch_multiqc_custom_config = Channel.fromPath(params.multiqc_config, checkIfExists: true)
} else {
    ch_multiqc_custom_config = Channel.empty()
}

// A fixed CPU count for SPAdes / SPAdes hybrid must not exceed the configured maximum
if ( params.spades_fix_cpus && params.spades_fix_cpus > params.max_cpus ) {
    exit 1, "Invalid parameter '--spades_fix_cpus ${params.spades_fix_cpus}', max cpus are '${params.max_cpus}'."
}
if ( params.spadeshybrid_fix_cpus && params.spadeshybrid_fix_cpus > params.max_cpus ) {
    exit 1, "Invalid parameter '--spadeshybrid_fix_cpus ${params.spadeshybrid_fix_cpus}', max cpus are '${params.max_cpus}'."
}

// As soon as one assembler is pinned for reproducibility, warn about every other
// tool that is still configured in a non-reproducible way.
// The SPAdes based assemblers are only reported as active for paired-end data in the
// run summary further down in this script, so their warnings are restricted to
// paired-end runs; otherwise single-end users are told to fix a tool that is not run.
if (params.megahit_fix_cpu_1 || params.spades_fix_cpus || params.spadeshybrid_fix_cpus) {
    if (!params.single_end && !params.skip_spades && !params.spades_fix_cpus) {
        log.warn "At least one assembly process is run with a parameter to ensure reproducible results, but SPAdes not. Consider using the parameter '--spades_fix_cpus'."
    }
    if (params.manifest && !params.single_end && !params.skip_spadeshybrid && !params.spadeshybrid_fix_cpus) {
        log.warn "At least one assembly process is run with a parameter to ensure reproducible results, but SPAdes hybrid not. Consider using the parameter '--spadeshybrid_fix_cpus'."
    }
    if (!params.skip_megahit && !params.megahit_fix_cpu_1) {
        log.warn "At least one assembly process is run with a parameter to ensure reproducible results, but MEGAHIT not. Consider using the parameter '--megahit_fix_cpu_1'."
    }
    // A seed of 0 makes MetaBAT2 use a random seed
    if (!params.skip_binning && params.metabat_rng_seed == 0) {
        log.warn "At least one assembly process is run with a parameter to ensure reproducible results, but for MetaBAT2 a random seed is specified ('--metabat_rng_seed 0'). Consider specifying a positive seed instead."
    }
}

/*
 * Channels for the reference databases.
 * A database that is not requested gets an empty channel, so the process
 * consuming it has nothing to run on.
 */
file_busco_db = params.skip_busco ?
    Channel.from() :
    Channel.fromPath( "${params.busco_reference}", checkIfExists: true )

file_centrifuge_db = params.centrifuge_db ?
    Channel.fromPath( "${params.centrifuge_db}", checkIfExists: true ) :
    Channel.from()

file_kraken2_db = params.kraken2_db ?
    Channel.fromPath( "${params.kraken2_db}", checkIfExists: true ) :
    Channel.from()

file_cat_db = params.cat_db ?
    Channel.fromPath( "${params.cat_db}", checkIfExists: true ) :
    Channel.from()

// The PhiX reference is only needed (and only defined) when PhiX reads get removed
if (!params.keep_phix) {
    file_phix_db = Channel.fromPath( "${params.phix_reference}", checkIfExists: true )
}

/*
 * Validate the parameters for host contamination removal and create the
 * channels holding the host reference:
 *   --host_genome : fasta and prebuilt Bowtie 2 index are taken from the iGenomes config
 *   --host_fasta  : only the fasta channel is created here
 *   neither       : the fasta channel is empty
 */
if ( params.host_fasta && params.host_genome) {
    exit 1, "Both host fasta reference and iGenomes genome are specified to remove host contamination! Invalid combination, please specify either --host_fasta or --host_genome."
}
if ( params.manifest && (params.host_fasta || params.host_genome) ) {
    log.warn "Host read removal is only applied to short reads. Long reads might be filtered indirectly by Filtlong, which is set to use read qualities estimated based on k-mer matches to the short, already filtered reads."
    if ( params.longreads_length_weight > 1 ) {
        log.warn "The parameter --longreads_length_weight is ${params.longreads_length_weight}, causing the read length being more important for long read filtering than the read quality. Set --longreads_length_weight to 1 in order to assign equal weights."
    }
}

if ( params.host_genome ) {
    // The requested genome must be present in the iGenomes config
    if ( !params.genomes.containsKey(params.host_genome) ) {
        exit 1, "The provided host genome '${params.host_genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
    }

    // Both the fasta and the Bowtie 2 index have to be configured for that genome
    host_fasta = params.genomes[params.host_genome].fasta ?: false
    if ( !host_fasta ) {
        exit 1, "No fasta file specified for the host genome ${params.host_genome}!"
    }
    Channel
        .value(file( "${host_fasta}", checkIfExists: true ))
        .set { ch_host_fasta }

    host_bowtie2index = params.genomes[params.host_genome].bowtie2 ?: false
    if ( !host_bowtie2index ) {
        exit 1, "No Bowtie 2 index file specified for the host genome ${params.host_genome}!"
    }
    // All files inside the index directory are passed on as one value
    Channel
        .value(file( "${host_bowtie2index}/*", checkIfExists: true ))
        .set { ch_host_bowtie2index }
} else if ( params.host_fasta ) {
    Channel
        .value(file( "${params.host_fasta}", checkIfExists: true ))
        .set { ch_host_fasta }
} else {
    ch_host_fasta = Channel.empty()
}

/*
 * Create a channel for input read files
 */

// Build the short read channels (read_files_fastqc, read_files_fastp) and the long
// read channel (files_long_raw) from one of three sources, in order of priority:
//   1. --manifest    : TSV with the columns id, long reads, short reads 1, short reads 2
//   2. input_paths   : list of [ id, [ read files ] ] entries taken from the params
//   3. --input       : glob pattern matching the short read files
// Long reads are only available with a manifest; otherwise files_long_raw is empty.
if (params.manifest) {
    // Abort with a clear message if the manifest file itself is missing
    manifestFile = file(params.manifest, checkIfExists: true)
    // extract the read files from the TSV and distribute them into channels
    Channel
        .from(manifestFile)
        .ifEmpty { exit 1, "Cannot find path file ${params.manifest}" }
        .splitCsv(sep:'\t')
        .map { row ->
            def id = row[0]
            def lr = file(row[1], checkIfExists: true)
            def sr1 = file(row[2], checkIfExists: true)
            def sr2 = file(row[3], checkIfExists: true)
            [ id, lr, sr1, sr2 ]
        }
        .into { files_all_sr; files_all_lr }
    // prepare input for preprocessing
    files_all_sr
        .map { id, lr, sr1, sr2 -> [ id, [ sr1, sr2 ] ] }
        .into { read_files_fastqc; read_files_fastp }
    files_all_lr
        .map { id, lr, sr1, sr2 -> [ id, lr ] }
        .set { files_long_raw }
} else if (params.input_paths) {
    Channel
        .from(params.input_paths)
        .map { row ->
            // single-end entries contribute one file, paired-end entries two
            def read_paths = params.single_end ? [ row[1][0] ] : [ row[1][0], row[1][1] ]
            [ row[0], read_paths.collect { file(it, checkIfExists: true) } ]
        }
        .ifEmpty { exit 1, "params.input_paths was empty - no input files supplied" }
        .into { read_files_fastqc; read_files_fastp }
    files_long_raw = Channel.from()
} else {
    // --input is the parameter documented in the help text and shown in the run summary
    Channel
        .fromFilePairs(params.input, size: params.single_end ? 1 : 2)
        .ifEmpty { exit 1, "Cannot find any reads matching: ${params.input}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." }
        .into { read_files_fastqc; read_files_fastp }
    files_long_raw = Channel.from()
}

// Header log info:
// print the banner, then collect all relevant settings of this run in the
// `summary` map (label -> value). The map is logged below and is also the
// source of the workflow summary channel built later in this script.
log.info nfcoreHeader()
def summary = [:]
if (workflow.revision) summary['Pipeline Release'] = workflow.revision
summary['Run Name'] = custom_runName ?: workflow.runName
// Report the input source using the same priority as the input channel creation
if (params.input_paths) summary['Input paths']     = params.input_paths
else if (params.manifest) summary['Manifest']   = params.manifest
else summary['Input']                           = params.input
summary['Data Type']                  = params.single_end ? 'Single-End' : 'Paired-End'

// Short read preprocessing
summary['Adapter forward']            = params.adapter_forward
summary['Adapter reverse']            = params.adapter_reverse
summary['Mean quality']               = params.mean_quality
summary['Trimming quality']           = params.trimming_quality
summary['Keep phix reads']            = params.keep_phix ? 'Yes' : 'No'
if (!params.keep_phix) summary['PhiX reference']               = params.phix_reference
if (params.host_genome) summary['Host Genome']               = params.host_genome
else if(params.host_fasta) summary['Host Fasta Reference']   = params.host_fasta
if (params.host_genome || params.host_fasta) summary['Host removal setting'] = params.host_removal_verysensitive ? 'very-sensitive' : 'sensitive'

// Long read preprocessing (long reads are only provided through a manifest)
if (params.manifest) {
    summary['Skip adapter trimming']     = params.skip_adapter_trimming ? 'Yes' : 'No'
    summary['Keep lambda reads']         = params.keep_lambda ? 'Yes' : 'No'
    if (!params.keep_lambda) summary['Lambda reference']               = params.lambda_reference
    summary['Long reads min length']     = params.longreads_min_length
    summary['Long reads keep percent']   = params.longreads_keep_percent
    summary['Long reads length weight']  = params.longreads_length_weight
}

// Taxonomy
if(params.centrifuge_db) summary['Centrifuge Db']   = params.centrifuge_db
if(params.kraken2_db) summary['Kraken2 Db']         = params.kraken2_db
if(params.cat_db) summary['CAT Db']                 = params.cat_db
summary['Skip krona']       = params.skip_krona ? 'Yes' : 'No'

// Binning, bin QC and assembly
summary['Skip binning']     = params.skip_binning ? 'Yes' : 'No'
if (!params.skip_binning) {
    summary['Min contig size']              = params.min_contig_size
    summary['Min length unbinned contigs']  = params.min_length_unbinned_contigs
    summary['Max unbinned contigs']         = params.max_unbinned_contigs
}
summary['Skip busco']           = params.skip_busco ? 'Yes' : 'No'
if(!params.skip_busco) summary['Busco Reference']   = params.busco_reference
summary['Skip spades']          = params.skip_spades ? 'Yes' : 'No'
summary['Skip spadeshybrid']    = params.skip_spadeshybrid ? 'Yes' : 'No'
summary['Skip megahit']         = params.skip_megahit ? 'Yes' : 'No'
summary['Skip quast']           = params.skip_quast ? 'Yes' : 'No'

// Reproducibility settings, listed only for the tools that are enabled
if (!params.skip_megahit)                                               summary['MEGAHIT fix cpus']         = params.megahit_fix_cpu_1 ? '1' : 'No'
if (!params.single_end && !params.skip_spades)                          summary['SPAdes fix cpus']          = params.spades_fix_cpus ? params.spades_fix_cpus : 'No'
if (params.manifest && !params.single_end && !params.skip_spadeshybrid) summary['SPAdes hybrid fix cpus']   = params.spadeshybrid_fix_cpus ? params.spadeshybrid_fix_cpus : 'No'
if (!params.skip_binning)                                               summary['MetaBAT2 RNG seed']        = params.metabat_rng_seed

// Resources, execution environment and notification settings
summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output dir']       = params.outdir
summary['Launch dir']       = workflow.launchDir
summary['Working dir']      = workflow.workDir
summary['Script dir']       = workflow.projectDir
summary['User']             = workflow.userName
if (workflow.profile.contains('awsbatch')) {
    summary['AWS Region']   = params.awsregion
    summary['AWS Queue']    = params.awsqueue
    summary['AWS CLI']      = params.awscli
}
summary['Config Profile'] = workflow.profile
if (params.config_profile_description) summary['Config Profile Description'] = params.config_profile_description
if (params.config_profile_contact)     summary['Config Profile Contact']     = params.config_profile_contact
if (params.config_profile_url)         summary['Config Profile URL']         = params.config_profile_url
summary['Config Files'] = workflow.configFiles.join(', ')
if (params.email || params.email_on_fail) {
    summary['E-mail Address']    = params.email
    summary['E-mail on failure'] = params.email_on_fail
    summary['MultiQC maxsize']   = params.max_multiqc_email_size
}
// One "label: value" line per entry; labels are padded to at least 18 characters
log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n")
log.info "-\033[2m--------------------------------------------------\033[0m-"

// Check the hostnames against configured profiles
checkHostname()

// Render the run summary as an HTML definition list wrapped in a YAML document
// and emit it as the single item of ch_workflow_summary.
// Entries without a value are shown as a greyed-out "N/A".
Channel.from(summary.collect{ [it.key, it.value] })
    .map { k,v -> "<dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }
    .reduce { a, b -> return [a, b].join("\n            ") }
    .map { x -> """
    id: 'nf-core-mag-summary'
    description: " - this information is collected when the pipeline is started."
    section_name: 'nf-core/mag Workflow Summary'
    section_href: 'https://github.com/nf-core/mag'
    plot_type: 'html'
    data: |
        <dl class=\"dl-horizontal\">
            $x
        </dl>
    """.stripIndent() }
    .set { ch_workflow_summary }

/*
 * Parse software version numbers
 */

// Write the output of `busco --version` to v_busco.txt in a task of its own;
// the file is handed to get_software_versions through ch_busco_version.
process get_busco_version {

    output:
    file "v_busco.txt" into ch_busco_version

    script:
    """
    busco --version > v_busco.txt
    """
}

// Collect the version output of the tools used by the pipeline.
// Every tool writes to its own v_<tool>.txt file; scrape_software_versions.py is then
// run and its stdout becomes software_versions_mqc.yaml.
// NOTE(review): software_versions.csv is declared as output but not written by any
// command visible here - presumably created by scrape_software_versions.py; confirm.
process get_software_versions {
    publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode,
        saveAs: { filename ->
                      // publish only files with ".csv" in their name (not at position 0);
                      // everything else, including the YAML, is not published
                      if (filename.indexOf(".csv") > 0) filename
                      else null
                }

    input:
    // BUSCO version file produced by get_busco_version
    file(busco_version) from ch_busco_version

    output:
    file 'software_versions_mqc.yaml' into ch_software_versions_yaml
    file "software_versions.csv"

    script:
    """
    echo $workflow.manifest.version > v_pipeline.txt
    echo $workflow.nextflow.version > v_nextflow.txt
    multiqc --version > v_multiqc.txt
    fastqc --version > v_fastqc.txt
    fastp -v 2> v_fastp.txt
    megahit --version > v_megahit.txt
    metabat2 -h 2> v_metabat.txt || true
    NanoPlot --version > v_nanoplot.txt
    filtlong --version > v_filtlong.txt
    porechop --version > v_porechop.txt
    NanoLyse --version > v_nanolyse.txt
    spades.py --version > v_spades.txt
    centrifuge --version > v_centrifuge.txt
    kraken2 -v > v_kraken2.txt
    CAT -v > v_cat.txt
    quast -v > v_quast.txt

    scrape_software_versions.py > software_versions_mqc.yaml
    """
}

/*
================================================================================
                                Preprocessing and QC for short reads
================================================================================
*/

// Run FastQC on the raw short reads of each sample.
// Only the non-zip results are published (to QC_shortreads/fastqc/); the zip
// archives are still emitted through the fastqc_results channel.
process fastqc_raw {
    tag "$name"
    publishDir "${params.outdir}/", mode: params.publish_dir_mode,
        saveAs: {filename -> filename.indexOf(".zip") == -1 ? "QC_shortreads/fastqc/$filename" : null}

    input:
    // name: sample id, reads: one (single-end) or two (paired-end) read files
    set val(name), file(reads) from read_files_fastqc

    output:
    file "*_fastqc.{zip,html}" into fastqc_results

    script:
    """
    fastqc -t "${task.cpus}" -q $reads
    """
}

// Adapter and quality trimming of the short reads with fastp.
// The trimmed reads are passed on via the trimmed_reads channel only; every other
// result file is published to QC_shortreads/fastp/<sample>/.
process fastp {
    tag "$name"
    publishDir "${params.outdir}/", mode: params.publish_dir_mode,
        saveAs: { filename ->
            // keep the trimmed reads out of the results directory
            if (filename.indexOf(".fastq.gz") != -1) return null
            return "QC_shortreads/fastp/$name/$filename"
        }

    input:
    set val(name), file(reads) from read_files_fastp
    val adapter from params.adapter_forward
    val adapter_reverse from params.adapter_reverse
    val qual from params.mean_quality
    val trim_qual from params.trimming_quality

    output:
    set val(name), file("${name}_trimmed*.fastq.gz") into trimmed_reads
    file("fastp.*")

    script:
    // start from the single-end command line pieces ...
    def pe_input = ''
    def pe_output1 = "-o \"${name}_trimmed.fastq.gz\""
    def pe_output2 = ''
    // ... and switch to the paired-end variants where needed
    if (!params.single_end) {
        pe_input = "-I \"${reads[1]}\""
        pe_output1 = "-o \"${name}_trimmed_R1.fastq.gz\""
        pe_output2 = "-O \"${name}_trimmed_R2.fastq.gz\""
    }
    """
    fastp -w "${task.cpus}" -q "${qual}" --cut_by_quality5 \
        --cut_by_quality3 --cut_mean_quality "${trim_qual}"\
        --adapter_sequence=${adapter} --adapter_sequence_r2=${adapter_reverse} \
        -i "${reads[0]}" $pe_input $pe_output1 $pe_output2
    """
}

/*
 * Remove host read contamination
 */
// Duplicate the fastp output: one copy feeds the host removal process, the other
// is kept under the name trimmed_reads for the case that no host removal is done
(trimmed_reads, ch_trimmed_reads_remove_host) = trimmed_reads.into(2)

// Build a Bowtie 2 index (base name "bt2_index_base") from the host fasta.
// Only executed when the host reference was given through --host_fasta.
process host_bowtie2index {
    tag "${genome}"

    input:
    file(genome) from ch_host_fasta

    output:
    // all files of the index are emitted together
    file("bt2_index_base*") into ch_host_bowtie2index

    when: params.host_fasta

    script:
    """
    bowtie2-build --threads "${task.cpus}" "${genome}" "bt2_index_base"
    """
}

// Map the trimmed short reads against the host index with Bowtie 2 and pass on the
// reads written by --un-conc-gz / --un-gz (i.e. the reads that did not align).
// The Bowtie 2 log and, if requested, the ids of the aligned reads are published;
// fastq.gz files are never published by this process.
process remove_host {
    tag "${name}"

    publishDir "${params.outdir}/QC_shortreads/remove_host/", mode: params.publish_dir_mode,
        saveAs: {filename ->
                    // publish everything except read files
                    if (filename.indexOf(".fastq.gz") == -1) "$filename"
                    else null
                }

    input:
    set val(name), file(reads) from ch_trimmed_reads_remove_host
    file(index) from ch_host_bowtie2index

    output:
    set val(name), file("${name}_host_unmapped*.fastq.gz") into ch_trimmed_reads_host_removed
    file("${name}.bowtie2.log") into ch_host_removed_log
    // only present when --host_removal_save_ids is set
    file("${name}_host_mapped*.read_ids.txt") optional true

    when: params.host_fasta || params.host_genome

    script:
    // Bowtie 2 sensitivity preset
    def sensitivity = params.host_removal_verysensitive ? "--very-sensitive" : "--sensitive"
    // "Y"/"N" flag evaluated by the shell test inside the script
    def save_ids = params.host_removal_save_ids ? "Y" : "N"
    // The index base name is derived from the simple name of the first index file.
    // In both variants the aligned reads are written temporarily, optionally reduced
    // to a sorted list of read ids, and then deleted.
    if ( !params.single_end ) {
        """
        bowtie2 -p "${task.cpus}" \
                -x ${index[0].getSimpleName()} \
                -1 "${reads[0]}" -2 "${reads[1]}" \
                $sensitivity \
                --un-conc-gz ${name}_host_unmapped_%.fastq.gz \
                --al-conc-gz ${name}_host_mapped_%.fastq.gz \
                1> /dev/null \
                2> ${name}.bowtie2.log

        if [ ${save_ids} = "Y" ] ; then
            gunzip -c ${name}_host_mapped_1.fastq.gz | awk '{if(NR%4==1) print substr(\$0, 2)}' | LC_ALL=C sort > ${name}_host_mapped_1.read_ids.txt
            gunzip -c ${name}_host_mapped_2.fastq.gz | awk '{if(NR%4==1) print substr(\$0, 2)}' | LC_ALL=C sort > ${name}_host_mapped_2.read_ids.txt
        fi
        rm -f ${name}_host_mapped_*.fastq.gz
        """
    } else {
        """
        bowtie2 -p "${task.cpus}" \
                -x ${index[0].getSimpleName()} \
                -U ${reads} \
                $sensitivity \
                --un-gz ${name}_host_unmapped.fastq.gz \
                --al-gz ${name}_host_mapped.fastq.gz \
                1> /dev/null \
                2> ${name}.bowtie2.log

        if [ ${save_ids} = "Y" ] ; then
            gunzip -c ${name}_host_mapped.fastq.gz | awk '{if(NR%4==1) print substr(\$0, 2)}' | LC_ALL=C sort > ${name}_host_mapped.read_ids.txt
        fi
        rm -f ${name}_host_mapped.fastq.gz
        """
    }
}

// With host removal enabled, the following steps continue with the host-filtered
// reads; otherwise the copy created for remove_host is not needed and gets closed.
if ( params.host_fasta || params.host_genome ) {
    trimmed_reads = ch_trimmed_reads_host_removed
} else {
    ch_trimmed_reads_remove_host.close()
}

/*
 * Remove PhiX contamination from Illumina reads
 * TODO: PhiX into/from iGenomes.conf?
 */
if(!params.keep_phix) {
    // Build a Bowtie 2 index with the base name "ref" from the PhiX reference fasta
    process phix_download_db {
        tag "${genome}"

        input:
        file(genome) from file_phix_db

        output:
        // reference fasta together with all index files
        set file(genome), file("ref*") into phix_db

        script:
        """
        bowtie2-build --threads "${task.cpus}" "${genome}" ref
        """
    }

    // Map the reads against the PhiX index and pass on the reads written by
    // --un-conc-gz / --un-gz (the reads that did not align). A small log with the
    // read counts before and after removal is published; read files are not.
    process remove_phix {
        tag "$name"

        publishDir "${params.outdir}", mode: params.publish_dir_mode,
            saveAs: {filename -> filename.indexOf(".fastq.gz") == -1 ? "QC_shortreads/remove_phix/$filename" : null}

        input:
        // every sample is combined with the (reference, index) tuple
        set val(name), file(reads), file(genome), file(db) from trimmed_reads.combine(phix_db)

        output:
        // the filtered reads are fanned out to all downstream consumers
        set val(name), file("*.fastq.gz") into (trimmed_reads_megahit, trimmed_reads_metabat, trimmed_reads_fastqc, trimmed_sr_spadeshybrid, trimmed_reads_spades, trimmed_reads_centrifuge, trimmed_reads_kraken2, trimmed_reads_bowtie2, trimmed_reads_filtlong)
        file("${name}_remove_phix.log")

        script:
        // Read counts in the log are the number of lines divided by 4;
        // for paired-end data only the first read file is counted.
        if ( !params.single_end ) {
            """
            bowtie2 -p "${task.cpus}" \
                    -x ref \
                    -1 "${reads[0]}" \
                    -2 "${reads[1]}" \
                    --un-conc-gz ${name}_phix_unmapped_%.fastq.gz \
                    1> /dev/null \
                    2> ${name}.bowtie2.log
            echo "Bowtie2 reference: ${genome}" >${name}_remove_phix.log
            gunzip -c ${reads[0]} | echo "Read pairs before removal: \$((`wc -l`/4))" >>${name}_remove_phix.log
            gunzip -c ${name}_phix_unmapped_1.fastq.gz | echo "Read pairs after removal: \$((`wc -l`/4))" >>${name}_remove_phix.log
            """
        } else {
            """
            bowtie2 -p "${task.cpus}" \
                    -x ref \
                    -U ${reads} \
                    --un-gz ${name}_phix_unmapped.fastq.gz \
                    1> /dev/null \
                    2> ${name}.bowtie2.log
            echo "Bowtie2 reference: ${genome}" >${name}_remove_phix.log
            gunzip -c ${reads[0]} | echo "Reads before removal: \$((`wc -l`/4))" >>${name}_remove_phix.log
            gunzip -c ${name}_phix_unmapped.fastq.gz | echo "Reads after removal: \$((`wc -l`/4))" >>${name}_remove_phix.log
            """
        }

    }
} else {
    // PhiX removal disabled: hand the reads unchanged to the same set of channels
    trimmed_reads.into {trimmed_reads_megahit; trimmed_reads_metabat; trimmed_reads_fastqc; trimmed_sr_spadeshybrid; trimmed_reads_spades; trimmed_reads_centrifuge; trimmed_reads_kraken2; trimmed_reads_bowtie2; trimmed_reads_filtlong}
}

// Run FastQC on the preprocessed short reads.
// The reports are renamed to "<sample>[_R1|_R2].trimmed_fastqc.{html,zip}".
// As for the raw reads, only the non-zip results are published.
process fastqc_trimmed {
    tag "$name"
    publishDir "${params.outdir}/", mode: params.publish_dir_mode,
        saveAs: {filename -> filename.indexOf(".zip") == -1 ? "QC_shortreads/fastqc/$filename" : null}

    input:
    set val(name), file(reads) from trimmed_reads_fastqc

    output:
    file "*_fastqc.{zip,html}" into fastqc_results_trimmed

    script:
    // paired-end: the reports of the two mates are told apart by the "1"/"2"
    // directly in front of "_fastqc" in the FastQC output names
    if ( !params.single_end ) {
        """
        fastqc -t "${task.cpus}" -q ${reads}
        mv *1_fastqc.html "${name}_R1.trimmed_fastqc.html"
        mv *2_fastqc.html "${name}_R2.trimmed_fastqc.html"
        mv *1_fastqc.zip "${name}_R1.trimmed_fastqc.zip"
        mv *2_fastqc.zip "${name}_R2.trimmed_fastqc.zip"
        """
    } else {
        """
        fastqc -t "${task.cpus}" -q ${reads}
        mv *_fastqc.html "${name}.trimmed_fastqc.html"
        mv *_fastqc.zip "${name}.trimmed_fastqc.zip"
        """
    }
}

/*
================================================================================
                                Preprocessing and QC for long reads
================================================================================
*/

/*
 * Trim adapter sequences on long read nanopore files
 */
if (!params.skip_adapter_trimming) {
    // Run Porechop on the raw long reads of each sample
    process porechop {
        tag "$id"

        input:
        set id, file(lr) from files_long_raw

        output:
        set id, file("${id}_porechop.fastq") into files_porechop
        // the untouched input is forwarded, labelled "raw", for NanoPlot
        set id, file(lr), val("raw") into files_nanoplot_raw

        script:
        """
        porechop -i ${lr} -t "${task.cpus}" -o ${id}_porechop.fastq
        """
    }
} else {
    // Adapter trimming skipped: the raw long reads are used in place of the
    // Porechop output and are additionally labelled "raw" for NanoPlot
    files_long_raw
        .into{ files_porechop; pre_files_nanoplot_raw }
    pre_files_nanoplot_raw
        .map { id, lr -> [ id, lr, "raw" ] }
        .set { files_nanoplot_raw }
}

/*
 * Remove reads mapping to the lambda genome.
 * TODO: add lambda phage to igenomes.config?
 */
if (!params.keep_lambda) {
    Channel
        .fromPath( "${params.lambda_reference}", checkIfExists: true )
        .set { file_nanolyse_db }
    // Pipe the long reads through NanoLyse with the lambda reference and gzip the result.
    // A log with the read counts (lines / 4) before and after is published;
    // the read file itself is not.
    // NOTE(review): the reads are fed in with plain `cat`; if adapter trimming is skipped
    // and the raw long reads are gzip-compressed, NanoLyse would presumably receive
    // compressed data - confirm what input formats are expected here.
    process nanolyse {
        tag "$id"

        publishDir "${params.outdir}", mode: params.publish_dir_mode,
            saveAs: {filename -> filename.indexOf(".fastq.gz") == -1 ? "QC_longreads/NanoLyse/$filename" : null}

        input:
        // every sample is combined with the lambda reference file
        set id, file(lr), file(nanolyse_db) from files_porechop.combine(file_nanolyse_db)

        output:
        set id, file("${id}_nanolyse.fastq.gz") into files_nanolyse
        file("${id}_nanolyse.log")

        script:
        """
        cat ${lr} | NanoLyse --reference $nanolyse_db | gzip > ${id}_nanolyse.fastq.gz

        echo "NanoLyse reference: $params.lambda_reference" >${id}_nanolyse.log
        cat ${lr} | echo "total reads before NanoLyse: \$((`wc -l`/4))" >>${id}_nanolyse.log
        gunzip -c ${id}_nanolyse.fastq.gz | echo "total reads after NanoLyse: \$((`wc -l`/4))" >>${id}_nanolyse.log
        """
    }
} else {
    // Lambda reads are kept: pass the long reads on unchanged
    files_porechop
        .set{ files_nanolyse }
}

// Pair the long reads of each sample with its (already filtered) short reads.
// The join key is the sample id; the short-read pair is flattened into two
// separate tuple elements: [ id, long reads, short reads 1, short reads 2 ]
files_nanolyse
    .join(trimmed_reads_filtlong)
    .map { sample_id, long_reads, short_reads ->
        [ sample_id, long_reads, short_reads[0], short_reads[1] ]
    }
    .set { ch_files_filtlong }

/*
 * Quality filter long reads focus on length instead of quality to improve assembly size
 */
// Runs filtlong on the long reads of each sample, passing the sample's short
// reads via -1/-2. Thresholds come from params.longreads_min_length,
// params.longreads_keep_percent and params.longreads_length_weight.
// The gzipped result is emitted twice: once for downstream use and once
// labelled 'filtered' for process nanoplot.
process filtlong {
    tag "$id"

    input:
    set id, file(lr), file(sr1), file(sr2) from ch_files_filtlong

    output:
    set id, file("${id}_lr_filtlong.fastq.gz") into files_lr_filtered
    set id, file("${id}_lr_filtlong.fastq.gz"), val('filtered') into files_nanoplot_filtered

    script:
    """
    filtlong \
        -1 ${sr1} \
        -2 ${sr2} \
        --min_length ${params.longreads_min_length} \
        --keep_percent ${params.longreads_keep_percent} \
        --trim \
        --length_weight ${params.longreads_length_weight} \
        ${lr} | gzip > ${id}_lr_filtlong.fastq.gz
    """
}

/*
 * Quality check for nanopore reads and Quality/Length Plots
 */
// Runs NanoPlot once per [ id, long-read file, type ] tuple, where `type` is
// "raw" or "filtered" depending on which channel the tuple came from.
// `type` is used as output file prefix and in the plot title, so that both
// runs of one sample can be published into the same NanoPlot_<id> directory.
process nanoplot {
    tag "$id"
    publishDir "${params.outdir}/QC_longreads/NanoPlot_${id}", mode: params.publish_dir_mode

    input:
    set id, file(lr), type from files_nanoplot_raw.mix(files_nanoplot_filtered)

    // outputs are only declared for publishing; no channel consumes them
    output:
    file '*.png'
    file '*.html'
    file '*.txt'

    script:
    """
    NanoPlot -t "${task.cpus}" -p ${type}_  --title ${id}_${type} -c darkblue --fastq ${lr}
    """
}

/*
================================================================================
                                Taxonomic information
================================================================================
*/

// Unpacks the centrifuge database archive and emits
// [ database name, extracted *.cf files ].
// The database name is the archive file name with ".tar.gz" removed.
process centrifuge_db_preparation {
    input:
    file(db) from file_centrifuge_db

    output:
    set val("${db.toString().replace(".tar.gz", "")}"), file("*.cf") into centrifuge_database

    script:
    """
    tar -xf "${db}"
    """
}

// Every trimmed read set is paired with every prepared centrifuge database
trimmed_reads_centrifuge
    .combine(centrifuge_database)
    .set { centrifuge_input }

/*
 * Classify the trimmed short reads of each sample with centrifuge.
 * Emits [ "centrifuge", sample name, results.krona ] for the krona step;
 * report.txt and kreport.txt are published, the *.krona file is not.
 */
process centrifuge {
    tag "${name}-${db_name}"
    publishDir "${params.outdir}/Taxonomy/centrifuge/${name}", mode: params.publish_dir_mode,
            saveAs: { published -> published.contains(".krona") ? null : published }

    input:
    set val(name), file(reads), val(db_name), file(db) from centrifuge_input

    output:
    set val("centrifuge"), val(name), file("results.krona") into centrifuge_to_krona
    file("report.txt")
    file("kreport.txt")

    script:
    // read arguments differ between single-end and paired-end input
    def read_args
    if (params.single_end) {
        read_args = "-U \"${reads}\""
    } else {
        read_args = "-1 \"${reads[0]}\" -2 \"${reads[1]}\""
    }
    """
    centrifuge -x "${db_name}" \
        -p "${task.cpus}" \
        --report-file report.txt \
        -S results.txt \
        ${read_args}
    centrifuge-kreport -x "${db_name}" results.txt > kreport.txt
    cat results.txt | cut -f 1,3 > results.krona
    """
}

// Unpacks the kraken2 database archive and emits
// [ database name, extracted *.k2d files ].
// The *.k2d files are expected one directory level below the work dir
// (pattern "*/*.k2d"); the name is the archive's baseName, i.e. the file
// name without its last extension.
process kraken2_db_preparation {
    input:
    file(db) from file_kraken2_db

    output:
    set val("${db.baseName}"), file("*/*.k2d") into kraken2_database

    script:
    """
    tar -xf "${db}"
    """
}

// Every trimmed read set is paired with every prepared kraken2 database
trimmed_reads_kraken2
    .combine(kraken2_database)
    .set { kraken2_input }

/*
 * Classify the trimmed short reads of each sample with kraken2.
 * The database files are staged into the local directory "database".
 * Emits [ "kraken2", sample name, results.krona ] for the krona step;
 * kraken2_report.txt is published, the *.krona file is not.
 */
process kraken2 {
    tag "${name}-${db_name}"
    publishDir "${params.outdir}/Taxonomy/kraken2/${name}", mode: params.publish_dir_mode,
            saveAs: { published -> published.contains(".krona") ? null : published }

    input:
    set val(name), file(reads), val(db_name), file("database/*") from kraken2_input

    output:
    set val("kraken2"), val(name), file("results.krona") into kraken2_to_krona
    file("kraken2_report.txt")

    script:
    // read arguments differ between single-end and paired-end input
    def read_args
    if (params.single_end) {
        read_args = "\"${reads}\""
    } else {
        read_args = "--paired \"${reads[0]}\" \"${reads[1]}\""
    }
    """
    kraken2 \
        --report-zero-counts \
        --threads "${task.cpus}" \
        --db database \
        --report kraken2_report.txt \
        ${read_args} \
        > kraken2.kraken
    cat kraken2.kraken | cut -f 2,3 > results.krona
    """
}

// Creates the krona taxonomy table by running ktUpdateTaxonomy.sh into the
// local directory "taxonomy". Only executed when a centrifuge or kraken2
// database was specified and krona is not skipped.
process krona_db {
    output:
    file("taxonomy/taxonomy.tab") into file_krona_db

    when:
    ( params.centrifuge_db || params.kraken2_db ) && !params.skip_krona

    script:
    """
    ktUpdateTaxonomy.sh taxonomy
    """
}

// Merge the classifier results of centrifuge and kraken2 and attach the krona
// taxonomy table to each [ classifier, sample name, results.krona ] tuple
centrifuge_to_krona
    .mix(kraken2_to_krona)
    .combine(file_krona_db)
    .set { krona_input }

// Renders one krona HTML chart per classifier result; the chart is published
// under Taxonomy/<classifier>/<sample name>
process krona {
    tag "${classifier}-${name}"
    publishDir "${params.outdir}/Taxonomy/${classifier}/${name}", mode: params.publish_dir_mode

    input:
    // the taxonomy table is staged at the path passed to `-tax` below
    set val(classifier), val(name), file(report), file("taxonomy/taxonomy.tab") from krona_input

    output:
    file("*.html")

    script:
    """
    ktImportTaxonomy "$report" -tax taxonomy
    """
}

/*
================================================================================
                                Assembly
================================================================================
*/

/*
 * Assemble the trimmed short reads of each sample with MEGAHIT.
 *
 * Emits [ "MEGAHIT", sample name, uncompressed contigs ] to QUAST and to the
 * binning steps. Only the log file(s) and the gzipped contigs are published
 * (under Assembly/); every other output is filtered out by `saveAs`.
 * Skipped when params.skip_megahit is set.
 */
process megahit {
    tag "$name"
    publishDir "${params.outdir}/", mode: params.publish_dir_mode,
        saveAs: {filename ->
          if (filename.indexOf(".log") > 0 || filename.indexOf(".contigs.fa.gz") > 0 ) "Assembly/$filename"
          else null}

    input:
    set val(name), file(reads) from trimmed_reads_megahit

    output:
    set val("MEGAHIT"), val("$name"), file("MEGAHIT/${name}.contigs.fa") into (assembly_megahit_to_quast, assembly_megahit_to_metabat)
    file("MEGAHIT/*.log")
    file("MEGAHIT/${name}.contigs.fa.gz")

    when:
    !params.skip_megahit

    script:
    // read arguments differ between single-end and paired-end input
    def input = params.single_end ? "-r \"${reads}\"" :  "-1 \"${reads[0]}\" -2 \"${reads[1]}\""
    // memory limit handed to megahit in bytes
    mem = task.memory.toBytes()
    // When '--megahit_fix_cpu_1' is requested the task must really run with a
    // single CPU; otherwise refuse to run instead of silently ignoring the flag.
    if ( !params.megahit_fix_cpu_1 || task.cpus == 1 )
        """
        megahit -t "${task.cpus}" -m $mem $input -o MEGAHIT --out-prefix "${name}"
        gzip -c "MEGAHIT/${name}.contigs.fa" > "MEGAHIT/${name}.contigs.fa.gz"
        """
    else
        error "ERROR: '--megahit_fix_cpu_1' was specified, but not successfully applied. Likely this is caused by changed process properties in a custom config file."
}


/*
 * metaSPAdes hybrid assembly
 *
 * The filtered long reads of each sample are paired with the sample's trimmed
 * short reads (matched on the sample id, tuple index 0) and assembled together.
 */

files_lr_filtered
    .combine(trimmed_sr_spadeshybrid, by: 0)
    .set { files_pre_spadeshybrid }

/*
 * Emits [ "SPAdesHybrid", sample id, uncompressed scaffolds ] to QUAST and to
 * the binning steps. The log and the gzipped contigs, scaffolds and assembly
 * graph are published under Assembly/SPAdesHybrid/.
 * Only runs for paired-end input given through a manifest, and unless
 * params.skip_spadeshybrid is set.
 */
process spadeshybrid {
    tag "$id"
    publishDir "${params.outdir}/", mode: params.publish_dir_mode, pattern: "${id}*",
        saveAs: {filename ->
          if (filename.indexOf(".log") > 0 || filename.indexOf("_scaffolds.fasta.gz") > 0 || filename.indexOf("_graph.gfa.gz") > 0 || filename.indexOf("_contigs.fasta.gz") > 0 ) "Assembly/SPAdesHybrid/$filename"
          else null}

    input:
    set id, file(lr), file(sr) from files_pre_spadeshybrid

    output:
    set val("SPAdesHybrid"), val("$id"), file("${id}_scaffolds.fasta") into (assembly_spadeshybrid_to_quast, assembly_spadeshybrid_to_metabat)
    file("${id}.log")
    file("${id}_contigs.fasta.gz")
    file("${id}_scaffolds.fasta.gz")
    file("${id}_graph.gfa.gz")

    when:
    params.manifest && !params.single_end && !params.skip_spadeshybrid

    script:
    // memory limit handed to metaspades.py in gigabytes
    maxmem = task.memory.toGiga()
    // When '--spadeshybrid_fix_cpus' is requested the task must really run with
    // that CPU count; otherwise refuse to run instead of silently ignoring it.
    // The scaffolds are compressed with `gzip -c` so that the uncompressed file
    // stays available as channel output.
    if ( !params.spadeshybrid_fix_cpus || task.cpus == params.spadeshybrid_fix_cpus )
        """
        metaspades.py \
            --threads "${task.cpus}" \
            --memory $maxmem \
            --pe1-1 ${sr[0]} \
            --pe1-2 ${sr[1]} \
            --nanopore ${lr} \
            -o spades
        mv spades/assembly_graph_with_scaffolds.gfa ${id}_graph.gfa
        mv spades/scaffolds.fasta ${id}_scaffolds.fasta
        mv spades/contigs.fasta ${id}_contigs.fasta
        mv spades/spades.log ${id}.log
        gzip "${id}_contigs.fasta"
        gzip "${id}_graph.gfa"
        gzip -c "${id}_scaffolds.fasta" > "${id}_scaffolds.fasta.gz"
        """
    else
        error "ERROR: '--spadeshybrid_fix_cpus' was specified, but not successfully applied. Likely this is caused by changed process properties in a custom config file."
}


/*
 * Assemble the trimmed paired-end short reads of each sample with metaSPAdes.
 *
 * Emits [ "SPAdes", sample id, uncompressed scaffolds ] to QUAST and to the
 * binning steps. The log and the gzipped contigs, scaffolds and assembly
 * graph are published under Assembly/SPAdes/.
 * Only runs for paired-end input and unless params.skip_spades is set.
 */
process spades {
    tag "$id"
    publishDir "${params.outdir}/", mode: params.publish_dir_mode, pattern: "${id}*",
        saveAs: {filename ->
          if (filename.indexOf(".log") > 0 || filename.indexOf("_scaffolds.fasta.gz") > 0 || filename.indexOf("_graph.gfa.gz") > 0 || filename.indexOf("_contigs.fasta.gz") > 0 ) "Assembly/SPAdes/$filename"
          else null}
    input:
    set id, file(sr) from trimmed_reads_spades

    output:
    set val("SPAdes"), val("$id"), file("${id}_scaffolds.fasta") into (assembly_spades_to_quast, assembly_spades_to_metabat)
    file("${id}.log")
    file("${id}_contigs.fasta.gz")
    file("${id}_scaffolds.fasta.gz")
    file("${id}_graph.gfa.gz")

    when:
    !params.single_end && !params.skip_spades

    script:
    // memory limit handed to metaspades.py in gigabytes
    maxmem = task.memory.toGiga()
    // When '--spades_fix_cpus' is requested the task must really run with that
    // CPU count; otherwise refuse to run instead of silently ignoring it.
    // The scaffolds are compressed with `gzip -c` so that the uncompressed file
    // stays available as channel output.
    if ( !params.spades_fix_cpus || task.cpus == params.spades_fix_cpus )
        """
        metaspades.py \
            --threads "${task.cpus}" \
            --memory $maxmem \
            --pe1-1 ${sr[0]} \
            --pe1-2 ${sr[1]} \
            -o spades
        mv spades/assembly_graph_with_scaffolds.gfa ${id}_graph.gfa
        mv spades/scaffolds.fasta ${id}_scaffolds.fasta
        mv spades/contigs.fasta ${id}_contigs.fasta
        mv spades/spades.log ${id}.log
        gzip "${id}_contigs.fasta"
        gzip "${id}_graph.gfa"
        gzip -c "${id}_scaffolds.fasta" > "${id}_scaffolds.fasta.gz"
        """
    else
        error "ERROR: '--spades_fix_cpus' was specified, but not successfully applied. Likely this is caused by changed process properties in a custom config file."
}


// Runs metaquast.py on every assembly produced by SPAdes, MEGAHIT and
// SPAdesHybrid. The content of "<sample>_QC" is published under
// Assembly/<assembler> and collected by process multiqc.
// Skipped when params.skip_quast is set.
process quast {
    tag "$assembler-$sample"
    publishDir "${params.outdir}/Assembly/$assembler", mode: params.publish_dir_mode

    input:
    set val(assembler), val(sample), file(assembly) from assembly_spades_to_quast.mix(assembly_megahit_to_quast).mix(assembly_spadeshybrid_to_quast)

    output:
    file("${sample}_QC/*") into quast_results

    when:
    !params.skip_quast

    script:
    """
    metaquast.py --threads "${task.cpus}" --rna-finding --max-ref-number 0 -l "${assembler}-${sample}" "${assembly}" -o "${sample}_QC"
    """
}

// Gather the assemblies of all assemblers into one channel of
// [ assembler, sample, assembly ] tuples
assembly_all_to_metabat = assembly_spades_to_metabat.mix(assembly_megahit_to_metabat,assembly_spadeshybrid_to_metabat)

// one copy is combined with the reads for mapping, the other copy is joined
// to the resulting alignments before binning
(assembly_all_to_metabat, assembly_all_to_metabat_copy) = assembly_all_to_metabat.into(2)

// Every assembly is combined with every trimmed read set, so each assembly is
// mapped against the reads of all samples.
// (No prior `Channel.empty()` initialisation is needed: the variable is
// assigned here before it is read for the first time.)
bowtie2_input = assembly_all_to_metabat
    .combine(trimmed_reads_bowtie2)

(bowtie2_input, bowtie2_input_copy) = bowtie2_input.into(2)

/*
================================================================================
                                Binning
================================================================================
*/

// Builds a bowtie2 index for one assembly, maps one read set (`sampleToMap`)
// against it and emits the sorted, indexed BAM as
// [ assembler, sample, bam, bai ]. Skipped when params.skip_binning is set.
process bowtie2 {
    tag "$assembler-$sample"

    input:
    set val(assembler), val(sample), file(assembly), val(sampleToMap), file(reads) from bowtie2_input

    output:
    set val(assembler), val(sample), file("${assembler}-${sample}-${sampleToMap}.bam"), file("${assembler}-${sample}-${sampleToMap}.bam.bai") into assembly_mapping_for_metabat

    when:
    !params.skip_binning

    script:
    // output name prefix; must match the file names declared in `output:`
    def name = "${assembler}-${sample}-${sampleToMap}"
    // read arguments differ between single-end and paired-end input
    def input = params.single_end ? "-U \"${reads}\"" :  "-1 \"${reads[0]}\" -2 \"${reads[1]}\""
        """
        bowtie2-build --threads "${task.cpus}" "${assembly}" ref
        bowtie2 -p "${task.cpus}" -x ref $input | \
            samtools view -@ "${task.cpus}" -bS | \
            samtools sort -@ "${task.cpus}" -o "${name}.bam"
        samtools index "${name}.bam"
        """
}

// Collect all alignments belonging to the same (assembler, sample) assembly
// and attach the assembly itself:
// [ assembler, sample, [bams], [bais], assembly ]
assembly_mapping_for_metabat = assembly_mapping_for_metabat.groupTuple(by:[0,1]).join(assembly_all_to_metabat_copy, by:[0,1])

// allows inspecting the channel content with `-dump-channels`
assembly_mapping_for_metabat = assembly_mapping_for_metabat.dump(tag:'assembly_mapping_for_metabat')

// Computes contig depths from the BAM files, bins the assembly with metabat2
// and post-processes the unbinned contigs with split_fasta.py.
// Emits [ assembler, sample, MetaBAT2/*.fa ] to BUSCO, CAT and QUAST.
// Everything except *.bam and *.fastq.gz files is published under
// GenomeBinning/. Skipped when params.skip_binning is set.
process metabat {
    tag "$assembler-$sample"
    publishDir "${params.outdir}/", mode: params.publish_dir_mode,
        saveAs: {filename -> (filename.indexOf(".bam") == -1 && filename.indexOf(".fastq.gz") == -1) ? "GenomeBinning/$filename" : null}

    input:
    set val(assembler), val(sample), file(bam), file(index), file(assembly) from assembly_mapping_for_metabat
    val(min_size) from params.min_contig_size
    val(max_unbinned) from params.max_unbinned_contigs
    val(min_length_unbinned) from params.min_length_unbinned_contigs

    output:
    set val(assembler), val(sample), file("MetaBAT2/*.fa") into (metabat_bins, metabat_bins_for_cat, metabat_bins_quast_bins)
    file("MetaBAT2/discarded/*")
    // note: the depth file is named after the assembly file, not after the sample
    file("${assembler}-${assembly}-depth.txt.gz")

    when:
    !params.skip_binning

    script:
    // prefix of all bin files written by metabat2
    def name = "${assembler}-${sample}"
    """
    OMP_NUM_THREADS=${task.cpus} jgi_summarize_bam_contig_depths --outputDepth depth.txt ${bam}
    gzip -c depth.txt > "${assembler}-${assembly}-depth.txt.gz"
    metabat2 -t "${task.cpus}" -i "${assembly}" -a depth.txt -o "MetaBAT2/${name}" -m ${min_size} --unbinned --seed ${params.metabat_rng_seed}

    #save unbinned contigs above thresholds into individual files, dump others in one file
    split_fasta.py MetaBAT2/${name}.unbinned.fa ${min_length_unbinned} ${max_unbinned} ${min_size}

    mkdir MetaBAT2/discarded
    mv MetaBAT2/${name}.lowDepth.fa MetaBAT2/discarded/
    mv MetaBAT2/${name}.tooShort.fa MetaBAT2/discarded/
    mv MetaBAT2/${name}.unbinned.pooled.fa MetaBAT2/discarded/
    mv MetaBAT2/${name}.unbinned.remaining.fa MetaBAT2/discarded/

    #rename splitted file so that it doesnt end up in following processes
    mv MetaBAT2/${name}.unbinned.fa ${name}.unbinned.fa
    """
}

// Unpacks the BUSCO database archive into "buscodb" and emits its content.
// The archive itself is declared as output too, so that it can be published
// under GenomeBinning/QC/BUSCO/reference/ when params.save_busco_reference
// is set; nothing else is published by this process.
process busco_db_preparation {
    tag "${database.baseName}"
    publishDir "${params.outdir}/GenomeBinning/QC/BUSCO/", mode: params.publish_dir_mode,
        saveAs: {filename -> (params.save_busco_reference && filename.indexOf(".tar.gz") > 0) ? "reference/$filename" : null}

    input:
    file(database) from file_busco_db

    output:
    file("buscodb/*") into busco_db
    file(database)

    script:
    """
    mkdir buscodb
    tar -xf ${database} -C buscodb
    """
}

// transpose() turns [ assembler, sample, [bins] ] into one tuple per bin;
// each of them is then paired with the BUSCO database
metabat_bins
    .transpose()
    .combine(busco_db)
    .set { metabat_db_busco }

/*
 * BUSCO: Quantitative measures for the assessment of genome assembly
 *
 * Runs BUSCO in genome mode on a single bin. Emits
 * [ assembler, sample, short summary file ] to MultiQC, to the overall
 * summary and to the plotting step. The log and, if any were found, the
 * gzipped single-copy BUSCO sequences (*.faa / *.fna) are published as well.
 */
process busco {
    tag "${bin}"
    publishDir "${params.outdir}/GenomeBinning/QC/BUSCO/", mode: params.publish_dir_mode

    input:
    set val(assembler), val(sample), file(bin), file(db) from metabat_db_busco

    output:
    set val(assembler), val(sample), file("short_summary.specific.*.${bin}.txt") into (ch_busco_multiqc, ch_busco_to_summary, ch_busco_plot)
    file("${bin}_busco.log")
    file("${bin}_buscos.faa.gz") optional true
    file("${bin}_buscos.fna.gz") optional true

    script:
    // "Y" unless the profile name contains "conda": in that case the shell
    // script below copies the augustus config from /opt/conda/pkgs into the
    // work directory and points AUGUSTUS_CONFIG_PATH to the copy
    if( workflow.profile.toString().indexOf("conda") == -1)
        cp_augustus_config = "Y"
    else
        cp_augustus_config = "N"

    // Notes on the shell script:
    //  - the used database name is recovered from the name of the single
    //    expected BUSCO short summary file via a bash regex match
    //  - with nullglob set, the two `for` loops run at most once (`break`) and
    //    not at all if no sequence files exist; they act as "if any file matches"
    """
    # get path to custom config file for busco (already configured during conda installation)
    busco_path="\$(which busco)"
    config_file="\${busco_path%bin/busco}share/busco/config.ini"

    # ensure augustus has write access to config directory
    if [ ${cp_augustus_config} = "Y" ] ; then
        cp -r /opt/conda/pkgs/augustus*/config augustus_config/
        export AUGUSTUS_CONFIG_PATH=augustus_config
    fi

    # place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it)
    mkdir dataset
    mv ${db} dataset/

    busco --lineage_dataset dataset/${db} \
        --mode genome \
        --in ${bin} \
        --config \${config_file} \
        --cpu "${task.cpus}" \
        --out "BUSCO" > ${bin}_busco.log

    # get used db name
    # (set nullgob: if pattern matches no files, expand to a null string rather than to itself)
    shopt -s nullglob
    summaries=(BUSCO/short_summary.specific.*.BUSCO.txt)
    if [ \${#summaries[@]} -ne 1 ]; then
        echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one."
        exit 1
    fi
    [[ \$summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]];
    db_name="\${BASH_REMATCH[1]}"
    echo "Used database: \${db_name}"

    cp BUSCO/short_summary.specific.\${db_name}.BUSCO.txt short_summary.specific.\${db_name}.${bin}.txt

    for f in BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*faa; do
        cat BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.faa.gz
        break
    done
    for f in BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*fna; do
        cat BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.fna.gz
        break
    done
    """
}

// Prepare channels for downstream processes: MultiQC and the overall summary
// only need the summary file, i.e. the third element of
// [ assembler, sample, summary file ]
ch_busco_multiqc = ch_busco_multiqc.map { busco_tuple -> busco_tuple[2] }
ch_busco_to_summary = ch_busco_to_summary.map { busco_tuple -> busco_tuple[2] }

// For plotting, the summaries are grouped by assembler and sample
ch_busco_plot = ch_busco_plot.groupTuple(by: [0,1])

// Creates one BUSCO figure and one BUSCO summary table per
// (assembler, sample) from the short summaries of all its bins.
// Before plotting, the summary files are renamed so that the bin name part
// no longer contains dots (see the comment in the script).
process busco_plot {
    tag "$assembler-$sample"
    publishDir "${params.outdir}/GenomeBinning/QC/BUSCO/", mode: params.publish_dir_mode

    input:
    set val(assembler), val(sample), file(summaries) from ch_busco_plot

    output:
    file("${assembler}-${sample}-busco_figure.png")
    file("${assembler}-${sample}-busco_figure.R")
    file("${assembler}-${sample}-busco_summary.txt")

    script:
    // bin names start with this prefix; it is used to split the summary file
    // names into database name and bin name
    def name = "${assembler}-${sample}"
    """
    # replace dots in bin names within summary file names by underscores
    # currently (BUSCO v4.1.3) generate_plot.py does not allow further dots
    for sum in ${summaries}; do
        [[ \${sum} =~ short_summary.(.*).${name}.(.*).txt ]];
        db_name=\${BASH_REMATCH[1]}
        bin="${name}.\${BASH_REMATCH[2]}"
        bin_new="\${bin//./_}"
        mv \${sum} short_summary.\${db_name}.\${bin_new}.txt
    done
    generate_plot.py --working_directory .

    mv busco_figure.png ${assembler}-${sample}-busco_figure.png
    mv busco_figure.R ${assembler}-${sample}-busco_figure.R

    summary_busco.py short_summary.*.txt > ${assembler}-${sample}-busco_summary.txt
    """
}

// Combines the BUSCO short summaries of all bins of the whole run into a
// single table, "busco_summary.txt", which is published under
// GenomeBinning/QC/ and merged with the QUAST results later on.
process busco_summary {
    publishDir "${params.outdir}/GenomeBinning/QC/", mode: params.publish_dir_mode

    input:
    // collect(): wait for all summaries and stage them in one task
    file("short_summary.*.txt") from ch_busco_to_summary.collect()

    output:
    file("busco_summary.txt") into busco_summary

    script:
    """
    summary_busco.py short_summary.*.txt > busco_summary.txt
    """
}


// Runs metaquast.py on every bin of one (assembler, sample) and concatenates
// the per-bin transposed reports into one summary table:
// the first report is copied including its header line, from all following
// reports the header line is dropped (`tail -n +2`).
// Skipped when params.skip_quast is set.
process quast_bins {
    tag "$assembler-$sample"
    publishDir "${params.outdir}/GenomeBinning/QC/", mode: params.publish_dir_mode

    input:
    set val(assembler), val(sample), file(bins) from metabat_bins_quast_bins

    output:
    path("QUAST/*") type('dir')
    file("QUAST/*-quast_summary.tsv") into quast_bin_summaries

    when:
    !params.skip_quast

    // The first two shell lines turn the interpolated list of bin files into a
    // bash array: square brackets are removed, then the string is split on
    // commas and spaces.
    script:
    """
    BINS=\$(echo \"$bins\" | sed 's/[][]//g')
    IFS=', ' read -r -a bins <<< \"\$BINS\"

    for bin in \"\${bins[@]}\"; do
        metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}"
        if ! [ -f "QUAST/${assembler}-${sample}-quast_summary.tsv" ]; then 
            cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${assembler}-${sample}-quast_summary.tsv"
        else
            tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${assembler}-${sample}-quast_summary.tsv"
        fi
    done
    """
}

// Concatenates the QUAST bin summaries of all (assembler, sample) pairs into
// "quast_summary.tsv" (header line kept only from the first file) and merges
// the result with the BUSCO summary via combine_tables.py.
process merge_quast_and_busco {
    publishDir "${params.outdir}/GenomeBinning/QC/", mode: params.publish_dir_mode

    input:
    file(quast_bin_sum) from quast_bin_summaries.collect()
    file(busco_sum) from busco_summary

    output:
    file("quast_and_busco_summary.tsv")
    file("quast_summary.tsv")

    // The first two shell lines turn the interpolated list of summary files
    // into a bash array: square brackets are removed, then the string is split
    // on commas and spaces.
    script:
    """
    QUAST_BIN=\$(echo \"$quast_bin_sum\" | sed 's/[][]//g')
    IFS=', ' read -r -a quast_bin <<< \"\$QUAST_BIN\"
    
    for quast_file in \"\${quast_bin[@]}\"; do
        if ! [ -f "quast_summary.tsv" ]; then 
            cp "\${quast_file}" "quast_summary.tsv"
        else
            tail -n +2 "\${quast_file}" >> "quast_summary.tsv"
        fi
    done   

    combine_tables.py $busco_sum quast_summary.tsv >quast_and_busco_summary.tsv
    """
}

/*
 * CAT: Bin Annotation Tool (BAT) are pipelines for the taxonomic classification of long DNA sequences and metagenome assembled genomes (MAGs/bins)
 */
// Unpacks the CAT database archive and emits
// [ database name, database files, taxonomy files ].
// The database name is the archive file name with ".tar.gz" removed.
// The archive is extracted into "catDB"; the directories whose names contain
// "taxonomy" and "CAT_database" are then moved to fixed names so that the
// output patterns below match independently of the archive layout.
process cat_db {
    tag "${database.baseName}"

    input:
    file(database) from file_cat_db

    output:
    set val("${database.toString().replace(".tar.gz", "")}"), file("database/*"), file("taxonomy/*") into cat_db

    script:
    """
    mkdir catDB
    tar -xf ${database} -C catDB
    mv `find catDB/ -type d -name "*taxonomy*"` taxonomy/
    mv `find catDB/ -type d -name "*CAT_database*"` database/
    """
}

// Pair the bins of every (assembler, sample) with the prepared CAT database
metabat_bins_for_cat
    .combine(cat_db)
    .set { cat_input }

// Classifies all bins of one (assembler, sample) with `CAT bins` and adds
// taxonomic names to the ORF2LCA and bin2classification tables.
// Files whose name contains ".names.txt" are published directly under
// Taxonomy/<assembler>, all other outputs in the sub-directory "raw".
process cat {
    tag "${assembler}-${sample}-${db_name}"
    publishDir "${params.outdir}/Taxonomy/${assembler}", mode: params.publish_dir_mode,
    saveAs: {filename ->
        if (filename.indexOf(".names.txt") > 0) filename
        else "raw/$filename"
    }

    input:
    // bins, database and taxonomy are staged into the fixed directories that
    // are passed to CAT in the script below
    set val(assembler), val(sample), file("bins/*"), val(db_name), file("database/*"), file("taxonomy/*") from cat_input

    output:
    file("*.ORF2LCA.txt")
    file("*.names.txt")
    file("*.predicted_proteins.faa")
    file("*.predicted_proteins.gff")
    file("*.log")
    file("*.bin2classification.txt")

    script:
    """
    CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${assembler}-${sample}" --I_know_what_Im_doing
    CAT add_names -i "${assembler}-${sample}.ORF2LCA.txt" -o "${assembler}-${sample}.ORF2LCA.names.txt" -t taxonomy/
    CAT add_names -i "${assembler}-${sample}.bin2classification.txt" -o "${assembler}-${sample}.bin2classification.names.txt" -t taxonomy/
    """
}

/*
================================================================================
                                MultiQC
================================================================================
*/

// Aggregates the QC results of the whole run into one MultiQC report.
// Most inputs use `.collect().ifEmpty([])`, so the process still runs when
// the corresponding step produced nothing.
process multiqc {
    publishDir "${params.outdir}/MultiQC", mode: params.publish_dir_mode

    input:
    file (multiqc_config) from ch_multiqc_config
    file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([])
    file (fastqc_raw:'fastqc/*') from fastqc_results.collect().ifEmpty([])
    file (fastqc_trimmed:'fastqc/*') from fastqc_results_trimmed.collect().ifEmpty([])
    file (host_removal) from ch_host_removed_log.collect().ifEmpty([])
    file ('quast*/*') from quast_results.collect().ifEmpty([])
    file (short_summary) from ch_busco_multiqc.collect().ifEmpty([])
    file ('software_versions/*') from ch_software_versions_yaml.collect()
    file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml")

    output:
    file "*multiqc_report.html" into ch_multiqc_report
    file "*_data"
    file "multiqc_plots"

    script:
    // optional command line arguments; each is an empty string when not applicable
    rtitle = custom_runName ? "--title \"$custom_runName\"" : ''
    // report file name derived from the run name: non-word characters become
    // underscores, repeated underscores are collapsed
    rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : ''
    custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : ''
    read_type = params.single_end ? "--single_end" : ''
    // With host read removal, MultiQC is run twice: the first run only parses
    // the bowtie2 logs, multiqc_to_custom_tsv.py converts the parsed data, and
    // the second run builds the final report while ignoring the original logs.
    if ( params.host_fasta || params.host_genome ) {
        """
        # get multiqc parsed data for bowtie 2
        multiqc -f $rtitle $rfilename $custom_config_file *.bowtie2.log
        multiqc_to_custom_tsv.py ${read_type}
        # run multiqc using custom content file instead of original bowtie2 log files
        multiqc -f $rtitle $rfilename $custom_config_file --ignore "*.bowtie2.log" .
        """
    } else {
        """
        multiqc -f $rtitle $rfilename $custom_config_file .
        """
    }
}

/*
 * Output Description HTML
 */
// Converts the output documentation to "results_description.html", which is
// published under pipeline_info/.
process output_documentation {
    publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode

    input:
    file output_docs from ch_output_docs
    // not referenced in the script; staged alongside the documentation file
    file images from ch_output_docs_images

    output:
    file "results_description.html"

    script:
    """
    markdown_to_html.py $output_docs -o results_description.html
    """
}

/*
 * Completion e-mail notification
 */
// Completion handler, run once when the workflow has finished (successfully
// or not). It
//   1. collects run metadata and renders the e-mail templates,
//   2. sends the summary e-mail if an address is configured
//      (via `sendmail`, falling back to `mail`),
//   3. writes the rendered report to <outdir>/pipeline_info/,
//   4. logs the final status.
workflow.onComplete {

    // Set up the e-mail variables
    def subject = "[nf-core/mag] Successful: $workflow.runName"
    if (!workflow.success) {
        subject = "[nf-core/mag] FAILED: $workflow.runName"
    }
    def email_fields = [:]
    email_fields['version'] = workflow.manifest.version
    email_fields['runName'] = custom_runName ?: workflow.runName
    email_fields['success'] = workflow.success
    email_fields['dateComplete'] = workflow.complete
    email_fields['duration'] = workflow.duration
    email_fields['exitStatus'] = workflow.exitStatus
    email_fields['errorMessage'] = (workflow.errorMessage ?: 'None')
    email_fields['errorReport'] = (workflow.errorReport ?: 'None')
    email_fields['commandLine'] = workflow.commandLine
    email_fields['projectDir'] = workflow.projectDir
    // note: this is the script-level `summary` map itself, so the entries
    // added below are added to that map as well
    email_fields['summary'] = summary
    email_fields['summary']['Date Started'] = workflow.start
    email_fields['summary']['Date Completed'] = workflow.complete
    email_fields['summary']['Pipeline script file path'] = workflow.scriptFile
    email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId
    if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository
    if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId
    if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision
    email_fields['summary']['Nextflow Version'] = workflow.nextflow.version
    email_fields['summary']['Nextflow Build'] = workflow.nextflow.build
    email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp

    // On success try attach the multiqc report.
    // `mqc_report` stays null if the run failed or the report is unavailable.
    def mqc_report = null
    try {
        if (workflow.success) {
            mqc_report = ch_multiqc_report.getVal()
            if (mqc_report.getClass() == ArrayList) {
                log.warn "[nf-core/mag] Found multiple reports from process 'multiqc', will use only one"
                mqc_report = mqc_report[0]
            }
        }
    } catch (all) {
        log.warn "[nf-core/mag] Could not attach MultiQC report to summary email"
    }

    // Check if we are only sending emails on failure
    email_address = params.email
    if (!params.email && params.email_on_fail && !workflow.success) {
        email_address = params.email_on_fail
    }

    // Render the TXT template
    def engine = new groovy.text.GStringTemplateEngine()
    def tf = new File("$baseDir/assets/email_template.txt")
    def txt_template = engine.createTemplate(tf).make(email_fields)
    def email_txt = txt_template.toString()

    // Render the HTML template
    def hf = new File("$baseDir/assets/email_template.html")
    def html_template = engine.createTemplate(hf).make(email_fields)
    def email_html = html_template.toString()

    // Render the sendmail template
    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
    def sf = new File("$baseDir/assets/sendmail_template.txt")
    def sendmail_template = engine.createTemplate(sf).make(smail_fields)
    def sendmail_html = sendmail_template.toString()

    // Send the HTML e-mail
    if (email_address) {
        try {
            // Deliberately jump to the `mail` fallback below when plaintext e-mail is requested
            if (params.plaintext_email) { throw new Exception('Send plaintext e-mail, not HTML') }
            // Try to send HTML e-mail using sendmail
            [ 'sendmail', '-t' ].execute() << sendmail_html
            log.info "[nf-core/mag] Sent summary e-mail to $email_address (sendmail)"
        } catch (all) {
            // Catch failures and try with plaintext
            def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ]
            // Attach the report only if there is one (it is null for failed runs)
            // and it does not exceed the configured size limit
            if ( mqc_report != null && mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) {
              mail_cmd += [ '-A', mqc_report ]
            }
            mail_cmd.execute() << email_html
            log.info "[nf-core/mag] Sent summary e-mail to $email_address (mail)"
        }
    }

    // Write summary e-mail HTML to a file
    def output_d = new File("${params.outdir}/pipeline_info/")
    if (!output_d.exists()) {
        output_d.mkdirs()
    }
    def output_hf = new File(output_d, "pipeline_report.html")
    output_hf.withWriter { w -> w << email_html }
    def output_tf = new File(output_d, "pipeline_report.txt")
    output_tf.withWriter { w -> w << email_txt }

    // ANSI colour codes for the final log messages (empty with --monochrome_logs)
    c_green = params.monochrome_logs ? '' : "\033[0;32m";
    c_purple = params.monochrome_logs ? '' : "\033[0;35m";
    c_red = params.monochrome_logs ? '' : "\033[0;31m";
    c_reset = params.monochrome_logs ? '' : "\033[0m";

    // Successful run in which some task errors were ignored
    if (workflow.stats.ignoredCount > 0 && workflow.success) {
        log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-"
        log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-"
        log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-"
    }

    if (workflow.success) {
        log.info "-${c_purple}[nf-core/mag]${c_green} Pipeline completed successfully${c_reset}-"
    } else {
        // on failure, hint at a possibly missing institutional profile
        checkHostname()
        log.info "-${c_purple}[nf-core/mag]${c_red} Pipeline completed with errors${c_reset}-"
    }

}


/**
 * Builds the nf-core/mag banner that is printed at pipeline start.
 *
 * The colour variables are assigned without `def`, exactly as before, so they
 * remain script-level variables. All of them are empty strings when
 * `params.monochrome_logs` is set.
 *
 * @return the banner text with its common indentation stripped
 */
def nfcoreHeader() {
    // Yields the given ANSI escape sequence, or '' for monochrome logs
    def ansi = { String escape_code -> params.monochrome_logs ? '' : escape_code }

    // Log colors ANSI codes
    c_black  = ansi("\033[0;30m")
    c_blue   = ansi("\033[0;34m")
    c_cyan   = ansi("\033[0;36m")
    c_dim    = ansi("\033[2m")
    c_green  = ansi("\033[0;32m")
    c_purple = ansi("\033[0;35m")
    c_reset  = ansi("\033[0m")
    c_white  = ansi("\033[0;37m")
    c_yellow = ansi("\033[0;33m")

    def banner = """    -${c_dim}--------------------------------------------------${c_reset}-
                                            ${c_green},--.${c_black}/${c_green},-.${c_reset}
    ${c_blue}        ___     __   __   __   ___     ${c_green}/,-._.--~\'${c_reset}
    ${c_blue}  |\\ | |__  __ /  ` /  \\ |__) |__         ${c_yellow}}  {${c_reset}
    ${c_blue}  | \\| |       \\__, \\__/ |  \\ |___     ${c_green}\\`-._,-`-,${c_reset}
                                            ${c_green}`._,._,\'${c_reset}
    ${c_purple}  nf-core/mag v${workflow.manifest.version}${c_reset}
    -${c_dim}--------------------------------------------------${c_reset}-
    """
    return banner.stripIndent()
}

/**
 * Logs an error-level warning if the machine's hostname matches one of the
 * host name patterns configured in `params.hostnames` for a profile that is
 * not part of the profiles the pipeline was started with.
 *
 * `params.hostnames` is iterated as profile name -> list of hostname
 * substrings. Nothing happens if it is not set.
 */
def checkHostname() {
    // Yields the given ANSI escape sequence, or '' for monochrome logs
    def ansi = { String escape_code -> params.monochrome_logs ? '' : escape_code }
    def c_reset = ansi("\033[0m")
    def c_white = ansi("\033[0;37m")
    def c_red = ansi("\033[1;91m")
    def c_yellow_bold = ansi("\033[1;93m")

    if (params.hostnames) {
        def hostname = "hostname".execute().text.trim()
        params.hostnames.each { prof, hnames ->
            hnames.each { hname ->
                // inside a closure, `return` only ends the current iteration
                if (!hostname.contains(hname)) {
                    return
                }
                if (workflow.profile.contains(prof)) {
                    return
                }
                def warning = "====================================================\n" +
                        "  ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" +
                        "  but your machine hostname is ${c_white}'$hostname'${c_reset}\n" +
                        "  ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" +
                        "============================================================"
                log.error warning
            }
        }
    }
}