main.nf

 #!/usr/bin/env nextflow
/*
========================================================================================
                         nf-core/nanoclust
========================================================================================
 nf-core/nanoclust Analysis Pipeline.
 #### Homepage / Documentation
 https://github.com/nf-core/nanoclust
----------------------------------------------------------------------------------------
*/

//nextflow.enable.dsl = 2

import groovy.io.FileType

// Print the pipeline banner; nfcoreHeader() is not defined in the visible part of this file.
log.info nfcoreHeader()
/**
 * Log the command-line usage text for the pipeline.
 *
 * The text is emitted through log.info after stripIndent().
 * NOTE(review): several parameters read further down in this script
 * (params.guppy_barcoder, params.onGridIon, params.remove_unclassified,
 * params.multiqc, params.throughput) are not listed in this help text.
 */
def helpMessage() {
    
    log.info"""

    Usage:

    The typical command for running the pipeline is as follows:

    nextflow run nf-core/nanoclust --reads 'reads.fastq' --db "path/to/db" --tax "path/to/taxdb" -profile conda

    Mandatory arguments:
      --reads                       Path to input data (must be surrounded with quotes)
      -profile                      Configuration profile to use. Can use multiple (comma separated)
                                    Available: conda, docker, singularity, awsbatch, test and more.

    UMAP and HDBSCAN clustering parameters:
      --umap_set_size               Number of reads used to perform the UMAP+HDBSCAN clustering (100000)
      --umap_n_neighbors            The size of the local neighborhood UMAP will look at when attempting to learn the manifold structure of the data (15)
      --umap_min_dist               The minimum distance apart that points are allowed to be in the low dimensional representation. (0.1)
      --cluster_sel_epsilon         Minimun distance to separate clusters. (0.5)
      --min_cluster_size            Minimum number of reads to call a independent cluster (100)
      --min_samples                 Measure of how conservative the clustering should be. Default is None.
      --min_read_length             Minimum number of base pair in sequence reads (1400)
      --max_read_length             Maximum number of base pair in sequence reads (1700)
      --avg_amplicon_size           Average size for the sequenced amplicon (ie: 1.5k for 16S/1.8k for 18S)

    Canu consensus correction options:
      --stopOnLowCoverage           Default (1)
      --minInputCoverage            Default (2)
      --minReadLength               Default (500)
      --minOverlapLength            Default (200)
      --useGrid                     Default (false)

    Classification options:
      --classification              Classification algorithm to choose from: seqmatch, kraken2, blast or full. Default (blast)
      --db                          Path to local database folder. If not specified for blast, search will be done againts NCBI 16S Microbial
      --tax                         Path to taxdb database which contains the names for the --db entries (blast) or RankedLineage.dmp file (kraken2)
      --accession                   Path to accession file with mapping between RDP tags and taxid. Required only for seqmatch.
      --reclassifyOnFail            Whether to reclassify kraken2 results with Seqmatch if species resolution hasn't been achieved. Redundant when classify set to full. (false)
      --db2                         Path to seqmatch database required if reclassifyOnFail is set to true or classification set to full. 
      --blast_db                    Path to blast nucleotide database required if classification set to full. 

    Reports options:
      --generateReports            Whether to generate PDF sample reports (false)
      --experimentInfo             File detailing experiment info to be included in the final sample PDF report, mandatory with --generateReports option (default file name: experiment_info.xlsx)


    Other options:
      --demultiplex                 Set this parameter if you file is a pooled sample
      --demultiplex_porechop        Same as --demultiplex but uses Porechop for the task
      --kit                         (Only with --demultiplex) Barcoding kit (RAB204) {Auto,PBC096,RBK004,NBD104/NBD114,PBK004/LWB001,RBK001,RAB204,VMK001,PBC001,NBD114,NBD103/NBD104,DUAL,RPB004/RLB001}
      --polishing_reads             Number of reads used for polishing (100)
      --outdir                      The output directory where the results will be saved
      --email                       Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
      --email_on_fail               Same as --email, except only send mail if the workflow is not successful
      --maxMultiqcEmailFileSize     Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
      --clusterOpts                 Additional options if running on the cluster (eg. "-P proj_name")
      -name                         Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.
    """.stripIndent()
}

// Show help message and exit with status 0 before any channel or process is declared
if (params.help) {
    helpMessage()
    exit 0
}

/*
 * SET UP CONFIGURATION VARIABLES
 */

// Warnings appended by medaka_pass when the racon step reports failure
def racon_warnings = []

// Exactly one input channel is created from --reads, chosen by the
// demultiplexing mode; the first matching flag wins.
if (params.demultiplex) {
    multiplexed_reads = Channel.fromPath(params.reads)
}
else if (params.demultiplex_porechop) {
    multiplexed_reads_porechop = Channel.fromPath(params.reads)
}
else if (params.guppy_barcoder) {
    // Directory input handed to guppy_barcoder
    for_guppy_demux = Channel.fromPath(params.reads, type:'dir')
}
else {
    // Directory input handed to cat_fastqs
    pre_reads = Channel.fromPath(params.reads, type:'dir')
}

// Run report and final summary are looked up one level above the launch directory
if (params.onGridIon) {
    def metadata_globs = ["$workflow.launchDir/../report*.html", "$workflow.launchDir/../final_summary*.txt"]
    metadata_files = Channel.fromPath(metadata_globs)
}

// Experiment sheet consumed by collect_metadata
if (params.generateReports) {
    samplesheet_ch = Channel.fromPath(params.experimentInfo)
}

// Has the run name been specified by the user?
//  this has the bonus effect of catching both -name and --name
// A runName of the form word_word is treated as Nextflow's generated mnemonic;
// anything else is taken as the user's choice.
def requested_name = params.name
def looks_auto_generated = workflow.runName ==~ /[a-z]+_[a-z]+/
custom_runName = looks_auto_generated ? requested_name : workflow.runName

// Stage config files (both must exist, otherwise file() raises an error)
ch_multiqc_config = file(params.multiqc_config, checkIfExists: true)
ch_output_docs = file("$baseDir/docs/3pipeline_output.md", checkIfExists: true)

// Header log info
//log.info nfcoreHeader()

// Ordered map of run facts; entries are logged below in insertion order.
def summary = [:]
if (workflow.revision) summary['Pipeline Release'] = workflow.revision
summary['Run Name']         = custom_runName ?: workflow.runName
// TODO nf-core: Report custom parameters here
summary['Reads']            = params.reads
summary['Max Resources']    = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output dir']       = params.outdir
summary['Launch dir']       = workflow.launchDir
summary['Working dir']      = workflow.workDir
summary['Script dir']       = workflow.projectDir
summary['User']             = workflow.userName

summary['Config Profile'] = workflow.profile
// Optional profile metadata is only reported when set
if (params.config_profile_description) summary['Config Description'] = params.config_profile_description
if (params.config_profile_contact)     summary['Config Contact']     = params.config_profile_contact
if (params.config_profile_url)         summary['Config URL']         = params.config_profile_url
// E-mail settings are reported as a group when either address is given
if (params.email || params.email_on_fail) {
  summary['E-mail Address']    = params.email
  summary['E-mail on failure'] = params.email_on_fail
  summary['MultiQC maxsize']   = params.maxMultiqcEmailFileSize
}
// One "key: value" line per entry, keys padded to 18 characters
log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n")
log.info "-\033[2m--------------------------------------------------\033[0m-"

// Check the hostnames against configured profiles
// (checkHostname is not defined in the visible part of this file)
checkHostname()

/**
 * Write the run summary as a MultiQC custom-content YAML file.
 *
 * Each entry of the map becomes a <dt>/<dd> pair inside an HTML definition
 * list; falsy values are rendered as a grey "N/A".
 *
 * @param summary map of label -> value to render
 * @return the path of workflow_summary_mqc.yaml inside the work directory
 */
def create_workflow_summary(summary) {
    def yaml_file = workDir.resolve('workflow_summary_mqc.yaml')
    // The placeholder span is closed with </span>; it was previously closed
    // with </a>, which produced malformed HTML in the report section.
    yaml_file.text  = """
    id: 'nf-core-nanoclust-summary'
    description: " - this information is collected when the pipeline is started."
    section_name: 'nf-core/nanoclust Workflow Summary'
    section_href: 'https://github.com/nf-core/nanoclust'
    plot_type: 'html'
    data: |
        <dl class=\"dl-horizontal\">
${summary.collect { k,v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }.join("\n")}
        </dl>
    """.stripIndent()

   return yaml_file
}

/**
 * Turn a database path given on the command line into the path used at run time.
 *
 * - a path starting with "/" is returned unchanged
 * - a path starting with "./" is prefixed with the project directory
 * - anything else is prefixed with the base directory when the profile is
 *   exactly 'conda' or 'test,conda', and with "/tmp/" otherwise
 */
def resolve_blast_db_path (path) {
    if (path ==~ /^\/.*/) {
        return path
    }
    if (path ==~ /^\.\/.*/) {
        return "$projectDir/" + path
    }
    def runs_under_conda = workflow.profile in ['conda', 'test,conda']
    if (runs_under_conda) {
        return "$baseDir/" + path
    }
    return "/tmp/" + path
}

/*
 * Parse software version numbers
 */
/*
process get_software_versions {
    publishDir "${params.outdir}/pipeline_info", mode: 'copy',
        saveAs: { filename ->
            if (filename.indexOf(".csv") > 0) filename
            else null
        }

    output:
    file 'software_versions_mqc.yaml' into software_versions_yaml
    file "software_versions.csv"

    script:
    // TODO nf-core: Get all tools to print their version number here
    """
    echo $workflow.manifest.version > v_pipeline.txt
    echo $workflow.nextflow.version > v_nextflow.txt
    """
} */

/*
 * STEP 1 - Quality control
 */

// Script-level variables; neither is referenced again in the visible part of this file.
good_reads = 0
cluster_count = []

// Demultiplex a pooled FASTQ with qcat; only declared when --demultiplex is set.
if(params.demultiplex) {
    process demultiplex {
        publishDir "${params.outdir}/demultiplexed_samples", mode: 'copy'

        input:
        file(reads) from multiplexed_reads

        output:
        // One emission per barcode file (mode flatten)
        file("barcode*.fastq") into reads mode flatten

        script:
        kit = params.kit
        // qcat writes its per-barcode output into the task directory (-b .)
        """
        qcat -f $reads -k $kit --trim -t ${task.cpus} -b .
        """
    }
}

// Demultiplex a pooled FASTQ with Porechop; only declared when --demultiplex_porechop is set.
if(params.demultiplex_porechop){
    process demultiplex_porechop {
        input:
        file(reads) from multiplexed_reads_porechop

        output:
        // One emission per BC*.fastq file (mode flatten)
        file("BC*.fastq") into reads mode flatten

        script:
        // Porechop writes its binned output into the task directory (-b .)
        """
        porechop -i "${reads}" -t ${task.cpus} -b .
        """
    }
}

// Processes that are only declared when --onGridIon is set.
if(params.onGridIon){

    // Extract kit name, protocol run id and start time from the run report
    // (HTML) and the final summary (text) collected in metadata_files.
    process process_metadata {

        input:
        tuple file(report), file(summ) from metadata_files.collect()

        output:
        // Values are read back from the shell variables set in the script
        tuple env(kit), env(run_id), env(seq_start) into barcoding_kit

        script:
        // kit: prefers the "Expansion kit" entry of the report, otherwise "Kit type".
        // seq_start: the 'started=' value with fractional seconds dropped, '-' turned
        // into '/', 'T' into a space, and the three date parts reversed by awk.
        """
        if grep -q '"Expansion kit", "value":' ${report}
        then
            kit=\$(grep '"Expansion kit", "value":' ${report} | grep -o -P 'Expansion.{0,35}' | cut -d '"' -f5)
        # elif grep -q "Expansion kit" ${report}
        # then
        #     kit=\$(grep "Expansion kit" ${report} -A1 | tail -n1| grep -oP '(?<=\\>).*?(?=\\<)')
        else
            kit=\$(grep '"Kit type", "value":' ${report} | grep -o -P 'Kit.{0,35}' | cut -d '"' -f5)
            
        fi
        run_id=\$(grep "protocol_run_id=" ${summ} | cut -d "=" -f2)
        seq_start=\$(grep 'started=' ${summ} | cut -d "=" -f2 | cut -d "." -f1 | sed 's/-/\\//g' | sed 's/T/ /g' | awk 'BEGIN{FS=OFS=" "} {split(\$1, a, /\\//); \$1 = a[3] "/" a[2] "/" a[1]} 1')
        echo \$kit
        echo \$run_id
        echo \$seq_start
        """

    }
    // Demultiplex with guppy_barcoder using the kit found by process_metadata.
    // NOTE(review): barcoding_kit is also read by generate_reports further down;
    // confirm that a second consumer of this channel is intended.
    if(params.guppy_barcoder){
        process guppy_barcoder {
            publishDir "${params.outdir}/guppy_demux", mode: 'copy'

            input:
            file(reads) from for_guppy_demux
            tuple val(kit), val(run_id), val(seq_start) from barcoding_kit.collect()

            output:
            file("barcode*.fastq") into reads mode flatten

            script:
            // The device is fixed to cuda:0; each barcode* directory is
            // concatenated into a single <barcode>.fastq afterwards.
            """
            guppy_barcoder -i $reads -s . -r --barcode_kits ${kit} --require_barcodes_both_ends -x cuda:0
            for i in barcode*; do cat \$i/* > \$i.fastq; done
            """
        }
    }
    
}

// Merge the per-barcode FASTQ files of an already demultiplexed run directory
// into one gzip-compressed file per barcode.
//
// The pre_reads channel is only created when none of --demultiplex,
// --demultiplex_porechop or --guppy_barcoder is set, and the demultiplexing
// processes already declare the `reads` output channel themselves. The process
// is therefore declared under the same condition; declaring it unconditionally
// referenced an undefined channel in every demultiplexing mode.
if(!params.demultiplex && !params.demultiplex_porechop && !params.guppy_barcoder) {
    process cat_fastqs {
        input:
        path(pre_reads) from pre_reads

        output:
        // One emission per <barcode>.fastq.gz (mode flatten)
        file("*.fastq.gz") into reads mode flatten

        script:
        // For each barcode* sub-directory: concatenate *.fastq.gz as they are,
        // or, when only plain *.fastq files exist, concatenate and gzip them.
        // A directory with neither produces no output file.
        """
        for i in ${pre_reads}/barcode*; do
            barcode=\$(basename \$i)
            output_file=\$barcode\\.fastq.gz

            if find \$i -name '*.fastq.gz' -type f -print -quit 2>/dev/null | grep -q '.'; then
                cat \$i/*.fastq.gz > \$output_file
            elif find \$i -name '*.fastq' -type f -print -quit 2>/dev/null | grep -q '.'; then
                cat \$i/*.fastq | gzip -c --best > \$output_file
            fi
        done
        """
    }
}

// Per-sample quality control: FastQC report plus fastp filtering by quality
// and read length. Only *.html files are published.
process QC {
    publishDir "${params.outdir}/QC_reports/", mode: 'copy', pattern: '*.html'

    input:
    file(reads) from reads

    output:
    // barcode and reads_count are read back from shell variables set in the script
    tuple env(barcode), file("*qced_reads.fastq") into qc_results
    tuple env(barcode), env(reads_count) into reads_count_ch
    file("*.{html,json}")

    script:
    // barcode is the input file's simpleName.
    // reads_count is the number of lines of the filtered FASTQ containing 'runid'
    // (presumably one per read header - confirm for inputs whose headers lack a runid tag).
    """
    barcode=${reads.simpleName}
    fastqc -q $reads
    fastp -i $reads -q 8 -l ${params.min_read_length} --length_limit ${params.max_read_length} -o \$barcode\\_qced_reads.fastq -h \$barcode\\_fastp.html
    reads_count=\$(grep 'runid' \$barcode\\_qced_reads.fastq | wc -l)
    """
}

// Optional pre-filter: keep only the reads kraken2 classifies against params.db.
// Without --remove_unclassified the QC output is passed through unchanged.
if(params.remove_unclassified){
    process remove_unclassified {

        input: 
        tuple val(barcode), file(qced_reads) from qc_results

        output: 
        tuple val(barcode), file("*classified.fastq") into for_subsetting_ch

        script:
        kraken2_db=params.db
        // Classified reads are written to <barcode>_classified.fastq
        """
        kraken2 --db $kraken2_db --report kraken2_consensus_classification.csv --output classification_out.tsv --classified-out ${barcode}_classified.fastq $qced_reads
        """
    }
}
else {
    // take(-1) forwards every item of the channel
    for_subsetting_ch = qc_results.take(-1)
}

// Keep the first params.umap_set_size reads of each sample for clustering.
process subset_reads {

    input: 
    tuple val(barcode), file(classified_reads) from for_subsetting_ch

    output:
    tuple val(barcode), file("*subset.fastq") into for_clustering_ch

    script:
    // head takes umap_set_size*4 lines, i.e. four lines per FASTQ record;
    // FastQC is then run on the subset.
    """
    head -n\$(( ${params.umap_set_size}*4 )) $classified_reads > ${barcode}_subset.fastq
    fastqc -q ${barcode}_subset.fastq
    """
}

// Aggregate QC results with MultiQC; only declared when --multiqc is set.
// NOTE(review): fastqc_results is not created anywhere in the visible part of
// this file (QC and subset_reads do not send their FastQC output into a
// channel) - confirm where it is meant to come from before enabling --multiqc.
if(params.multiqc){
    process multiqc {
        publishDir "${params.outdir}/MultiQC", mode: 'copy'

        input:
        // All inputs are staged under fastqc/; an empty channel yields an empty list
        file ('fastqc/*') from fastqc_results.collect().ifEmpty([])
        
        output:
        file "*multiqc_report.html"
        file "*_data"

        script:
        """
        multiqc . 
        """
    }
}

// Run kmer_freq.py on the subset reads of each sample and capture its stdout
// as freqs.txt.
process kmer_freqs {

    input:
    tuple val(barcode), file(reads) from for_clustering_ch

    output:
    // The frequency table and the (barcode, reads) pair leave on two separate channels
    file "freqs.txt" into freqs
    tuple val(barcode), file(reads) into freqs_qc_results

    script:   
    """
    kmer_freq.py -r $reads > freqs.txt
    """

}

// Cluster reads with the umap_hdbscan.py template.
// The resource label follows params.throughput: 'high' -> high_sensitivity,
// 'low' -> low_resource, anything else -> standard.
process read_clustering {
    label (params.throughput == 'high' ? 'high_sensitivity': params.throughput == 'low' ? 'low_resource' : 'standard')
    publishDir "${params.outdir}/${barcode}/", mode: 'copy', pattern: 'hdbscan.output.*'

    input:
    // NOTE(review): the table and the (barcode, reads) pair arrive on two
    // channels and are paired by emission order - confirm this is reliable.
    file(kmer_freqs) from freqs
    tuple val(barcode), file(qced_reads) from freqs_qc_results

    output:
    tuple val(barcode), file('hdbscan.output.tsv'), file(qced_reads) into clustering_out
    file('*.png')

    script:
    template "umap_hdbscan.py"
}

// Split a sample's reads into one FASTQ per cluster id and write a small
// "<cluster_id>;<read_count>" log next to each.
process split_by_cluster {
    input:
    tuple val(barcode), file(clusters), file(qced_reads) from clustering_out

    output:
    tuple val(barcode), file('*[0-9]*.log'), file('*[0-9]*.fastq') optional true into cluster_reads mode flatten

    script:
    // 1. Strip everything from " runid" onwards in the read file so that
    //    seqtk can match on the bare read id.
    // 2. Column 5 of the cluster table holds the cluster id; the ids are
    //    sorted in descending numeric order and the first one (the largest)
    //    is used as the loop bound.
    // 3. For every id from 0 up to that bound: collect the read ids (column 1),
    //    extract them with seqtk and record lines/4 as the read count.
    """
    sed 's/\\srunid.*//g' $qced_reads > only_id_header_readfile.fastq
    CLUSTERS=\$(awk '(\$5 ~ /[0-9]/) {print \$5}' $clusters | sort -nr | uniq )
    CLUSTERS_CNT=\$(echo \$CLUSTERS | awk '{print \$1}')

    for ((i = 0 ; i <= \$CLUSTERS_CNT ; i++));
    do
    cluster_id=\$i
    awk -v cluster="\$cluster_id" '(\$5 == cluster) {print \$1}' $clusters > \$cluster_id\\_ids.txt
    seqtk subseq only_id_header_readfile.fastq \$cluster_id\\_ids.txt > \$cluster_id.fastq
    READ_COUNT=\$(( \$(awk '{print \$1/4}' <(wc -l \$cluster_id.fastq)) ))
    echo -n "\$cluster_id;\$READ_COUNT" > \$cluster_id.log
    done
    """
}

// Correct the first params.polishing_reads reads of a cluster with canu and
// extend the cluster log with ";<reads used>;<reads after correction>;".
process read_correction {

    input:
    tuple val(barcode), file(cluster_log), file(reads) from cluster_reads

    output:
    tuple val(barcode), val(cluster_id), file('*_racon_.log'), file('corrected_reads.correctedReads.fasta') into corrected_reads

    script:
    count=params.polishing_reads
    // The cluster id is recovered from the log file name (<cluster_id>.log)
    cluster_id=cluster_log.baseName
    // The task exits with status 84 when canu's report contains "Found 0 reads".
    // READ_COUNT is the corrected FASTA's line count divided by two.
    """
    head -n\$(( $count*4 )) $reads > subset.fastq
    canu -correct -p corrected_reads -nanopore-raw subset.fastq genomeSize=${params.avg_amplicon_size} stopOnLowCoverage=${params.stopOnLowCoverage} minInputCoverage=${params.minInputCoverage} minReadLength=${params.minReadLength} minOverlapLength=${params.minOverlapLength} useGrid=${params.useGrid}
    if grep "Found 0 reads" corrected_reads.report
    then
        echo "Canu read correction has failed and the sample will be discontinued"
        exit 84
    fi
    gunzip corrected_reads.correctedReads.fasta.gz
    READ_COUNT=\$(( \$(awk '{print \$1/2}' <(wc -l corrected_reads.correctedReads.fasta)) ))
    cat $cluster_log > ${cluster_id}_racon.log
    echo -n ";$count;\$READ_COUNT;" >> ${cluster_id}_racon.log && cp ${cluster_id}_racon.log ${cluster_id}_racon_.log
    """
}

// Pick one corrected read per cluster as the draft sequence: the read with
// the highest mean fastANI value across the all-vs-all comparison.
process draft_selection {
    publishDir "${params.outdir}/${barcode}/cluster${cluster_id}", mode: 'copy', pattern: 'draft_read.fasta'

    input:
    tuple val(barcode), val(cluster_id), file(cluster_log), file(reads) from corrected_reads

    output:
    tuple val(barcode), val(cluster_id), file('*_draft.log'), file('draft_read.fasta'), file(reads) into draft

    script:
    // 1. split writes every two lines of the FASTA (one record) to its own file.
    // 2. fastANI compares the resulting files against each other.
    // 3. awk averages column 3 per query (column 1); the top-ranked file is the draft.
    // 4. The draft's header is appended to the cluster log; an empty draft
    //    ends the task with status 73.
    """
    split -l 2 $reads split_reads
    find split_reads* > read_list.txt

    fastANI --ql read_list.txt --rl read_list.txt -o fastani_output.ani -t 4 -k 16 --fragLen 160

    DRAFT=\$(awk 'NR>1{name[\$1] = \$1; arr[\$1] += \$3; count[\$1] += 1}  END{for (a in arr) {print arr[a] / count[a], name[a] }}' fastani_output.ani | sort -rg | cut -d " " -f2 | head -n1)
    cat \$DRAFT > draft_read.fasta

    if [ -s draft_read.fasta ]; then
        ID=\$(head -n1 draft_read.fasta | sed 's/>//g')
        cat $cluster_log > ${cluster_id}_draft.log
        echo -n \$ID >> ${cluster_id}_draft.log
    else
        exit 73
    fi
"""
}

// Polish the draft read with racon using the corrected reads of the cluster.
//
// Emits the consensus together with a success flag (1 = racon produced a
// non-empty consensus, 0 = the draft read was substituted). The flag is read
// back from the shell variable `success`.
process racon_pass {

    input:
    tuple val(barcode), val(cluster_id), file(cluster_log), file(draft_read), file(corrected_reads) from draft

    output:
    tuple val(barcode), val(cluster_id), file(cluster_log), file('racon_consensus.fasta'), file(corrected_reads), env(success) into racon_output

    script:
    // The failure flag must survive the draft fallback. Previously the draft
    // was copied into racon_consensus.fasta first and the "is the file
    // non-empty" check then reset success to 1, so a racon failure was never
    // reported to the next step.
    """
    success=1
    minimap2 -ax map-ont --no-long-join -r100 -a $draft_read $corrected_reads -o aligned.sam

    # racon exiting non-zero counts as failure
    if ! racon --quality-threshold=9 -w 250 $corrected_reads aligned.sam $draft_read > racon_consensus.fasta ; then
        success=0
    fi

    # so does an empty consensus file
    if [ ! -s racon_consensus.fasta ]; then
        success=0
    fi

    # on failure the draft read stands in for the consensus
    if [ "\$success" = "0" ]; then
        cat $draft_read > racon_consensus.fasta
    fi
    """
}

// Polish the racon consensus with medaka_consensus.
//
// The declared output is consensus_medaka.fasta/consensus.fasta, i.e. the
// consensus file inside medaka's output directory. When medaka fails, the
// incoming sequence is placed at that same path so the cluster still reaches
// classification.
process medaka_pass {

    publishDir "${params.outdir}/${barcode}/cluster${cluster_id}", mode: 'copy', pattern: 'consensus_medaka.fasta/consensus.fasta'

    input:
    tuple val(barcode), val(cluster_id), file(cluster_log), file(draft), file(corrected_reads), val(success) from racon_output

    output:
    tuple val(barcode), val(cluster_id), file(cluster_log), file('consensus_medaka.fasta/consensus.fasta') into final_consensus

    script:
    // Report a failed racon step (flag "0" from racon_pass) and remember it
    if(success == "0"){
    log.warn """Sample $barcode : Racon correction for cluster $cluster_id failed due to not enough overlaps. Taking draft read as consensus"""
    racon_warnings.add("""Sample $barcode : Racon correction for cluster $cluster_id failed due to not enough overlaps. Taking draft read as consensus""")
    }
    /*
    for some reason including the model was causing medaka to fail -m r941_min_high_g303 - so removed
    */

    // The fallback used to write to consensus_medaka.fasta itself, which is
    // the output directory name, so the declared output file never existed.
    // Whatever medaka left behind is removed and the directory recreated.
    """
    if medaka_consensus -i $corrected_reads -d $draft -o consensus_medaka.fasta -t 4 ; then
    echo "Command succeeded"
    else
    rm -rf consensus_medaka.fasta
    mkdir -p consensus_medaka.fasta
    cat $draft > consensus_medaka.fasta/consensus.fasta
    fi
    """

}

// Classify each cluster consensus with the method selected by
// params.classification ('full', 'seqmatch', 'kraken2' or 'blast') and append
// the top hit(s) to the cluster log as <cluster_id>_classification.log.
// No script is defined for any other value of params.classification.
process consensus_classification {
    publishDir "${params.outdir}/${barcode}/cluster${cluster_id}", mode: 'copy'
    /*
    time '3m'
    errorStrategy { sleep(1000); return 'retry' }
    maxRetries 5
    */

    input:
    tuple val(barcode), val(cluster_id), file(cluster_log), file(consensus) from final_consensus

    output:
    file('*consensus_classification.csv')
    file('classification_out.tsv') optional true
    tuple val(barcode), file('*_classification.log') into classifications_ch

    script:
    // 'full': run kraken2, seqmatch and blastn in turn; the log receives the
    // kraken2 line, the best seqmatch line and the best blast line.
    // An empty seqmatch or blast result is replaced by "unclassified;0;0".
    if(params.classification=='full'){
        kraken2_db=params.db
        seqmatch_db=params.db2
        seqmatch_accession=params.accession
        blast_db=params.blast_db
        """
        echo "chosen classification: full"
        echo "classifying with kraken2"
        kraken2 --db $kraken2_db --report kraken2_consensus_classification.csv --output classification_out.tsv $consensus
        KR_OUT=\$(sed 's/\t/;/g' kraken2_consensus_classification.csv | tr -s ' ' | sed 's/; /;/g' | cut -d ';' -f3,4,5,6 | grep -v '^0' | awk 'BEGIN {FS=";"; OFS=";"} {print \$4, \$3, \$2}')
        
        echo "classifying with seqmatch"
        SequenceMatch seqmatch -k 5 $seqmatch_db $consensus | cut -f2,4 | sort | join -t \$'\t' -1 1 -2 1 -o 2.3,2.5,1.2 - $seqmatch_accession | sort -k3 -n -r -t '\t' | sed 's/\t/;/g' > seqmatch_consensus_classification.csv
        if [ -s seqmatch_consensus_classification.csv ]; then
            echo "success"
        else
            echo "unclassified;0;0" >> seqmatch_consensus_classification.csv
        fi
        SEQ_OUT=\$(head -n1 seqmatch_consensus_classification.csv)

        echo "classifying with blastn"
        export BLASTDB=\$(dirname $blast_db)
        blastn -query $consensus -db \$(basename $blast_db) -task megablast -dust no -outfmt "10 sscinames staxids evalue length pident bitscore" -evalue 11 -max_hsps 50 -max_target_seqs 5 | sed 's/,/;/g' > blastn_consensus_classification.csv
        if [ -s blastn_consensus_classification.csv ]; then
            echo "success"
        else
            echo "unclassified;0;0" >> blastn_consensus_classification.csv
        fi
        BLAST_OUT=\$(cut -d";" -f1,2,5 blastn_consensus_classification.csv | head -n1)

        FULL_OUT="\${KR_OUT}\n\${SEQ_OUT}\n\${BLAST_OUT}"

        cat $cluster_log > ${cluster_id}_classification.log
        echo -n ";" >> ${cluster_id}_classification.log
        echo \$KR_OUT >> ${cluster_id}_classification.log
        echo \$SEQ_OUT >> ${cluster_id}_classification.log
        echo \$BLAST_OUT >> ${cluster_id}_classification.log
        """
    }
    // 'seqmatch': join the seqmatch hits with the accession file, sort by the
    // third column (descending, numeric) and log the first line.
    else if(params.classification=='seqmatch'){
        db=params.db
        accession=params.accession
        """
        echo "chosen classification: seqmatch"
        SequenceMatch seqmatch -k 5 $db $consensus | cut -f2,4 | sort | join -t \$'\t' -1 1 -2 1 -o 2.3,2.5,1.2 - $accession | sort -k3 -n -r -t '\t' | sed 's/\t/;/g' > consensus_classification.csv
        cat $cluster_log > ${cluster_id}_classification.log
        echo -n ";" >> ${cluster_id}_classification.log
        SEQ_OUT=\$(head -n1 consensus_classification.csv)
        echo \$SEQ_OUT >> ${cluster_id}_classification.log
        """
    }
    // 'kraken2': optionally fall back to seqmatch when the rank code on the
    // last line of the kraken2 report does not start with "S".
    else if(params.classification=='kraken2'){
        db=params.db
        if(params.reclassifyOnFail){
            accession=params.accession
            db2=params.db2
            """
            echo "chosen classification: kraken2"
            kraken2 --db $db --report consensus_classification.csv --output classification_out.tsv $consensus
            CLASS_LVL=\$(cut -f4 consensus_classification.csv | tail -n1)
            echo \$CLASS_LVL
            if [[ \$CLASS_LVL != "S"* ]]
            then
                echo "reclassifying"
                SequenceMatch seqmatch -k 5 $db2 $consensus | cut -f2,4 | sort | join -t \$'\t' -1 1 -2 1 -o 2.3,2.5,1.2 - $accession | sort -k3 -n -r -t '\t' | sed 's/\t/;/g' > consensus_classification.csv
                cat $cluster_log > ${cluster_id}_classification.log
                echo -n ";" >> ${cluster_id}_classification.log
                SEQ_OUT=\$(head -n1 consensus_classification.csv)
                echo \$SEQ_OUT >> ${cluster_id}_classification.log
                echo ${params.classification}
            else
                cat $cluster_log > ${cluster_id}_classification.log
                echo -n ";" >> ${cluster_id}_classification.log
                KR_OUT=\$(sed 's/\t/;/g' consensus_classification.csv | tr -s ' ' | sed 's/; /;/g' | cut -d ';' -f3,4,5,6 | grep -v '^0' | awk 'BEGIN {FS=";"; OFS=";"} {print \$4, \$3, \$2}')
                echo \$KR_OUT >> ${cluster_id}_classification.log
            fi
            """
        }
        else {
            """
            echo "chosen classification: kraken2"
            kraken2 --db $db --report consensus_classification.csv --output classification_out.tsv $consensus
            cat $cluster_log > ${cluster_id}_classification.log
            echo -n ";" >> ${cluster_id}_classification.log
            KR_OUT=\$(sed 's/\t/;/g' consensus_classification.csv | tr -s ' ' | sed 's/; /;/g' | cut -d ';' -f3,4,5,6 | grep -v '^0' | awk 'BEGIN {FS=";"; OFS=";"} {print \$4, \$3, \$2}')
            echo \$KR_OUT >> ${cluster_id}_classification.log
            """
        }
    }
    // 'blast': remote search against nr when no --db is given, otherwise a
    // local search with BLASTDB pointing at the taxonomy database location.
    else if(params.classification=='blast'){
        // NOTE(review): the two values resolved here are overwritten a few
        // lines below (db=params.db, taxdb=blast_dir + params.tax), so
        // resolve_blast_db_path currently has no effect - confirm which of the
        // two assignments is the intended one.
        db = resolve_blast_db_path(params.db)
        taxdb = resolve_blast_db_path(params.tax)

        if(workflow.profile == 'conda' || workflow.profile == 'test,conda'){
            blast_dir = "$baseDir/"
        }
        else {
            blast_dir = "/tmp/"
        }

        db=params.db
        taxdb=blast_dir + params.tax
        // NOTE(review): in the remote branch the blastn output is not passed
        // through sed 's/,/;/g', yet the following cut splits on ";" - confirm
        // the expected delimiter.
        if(!params.db){
            """
            echo "chosen classification: blast"
            blastn -query $consensus -db nr -remote -entrez_query "Bacteria [Organism]" -task blastn -dust no -outfmt "10 staxids sscinames evalue length score pident" -evalue 11 -max_hsps 50 -max_target_seqs 5 > consensus_classification.csv
            cat $cluster_log > ${cluster_id}_classification.log
            echo -n ";" >> ${cluster_id}_classification.log
            BLAST_OUT=\$(cut -d";" -f1,2,4,5 consensus_classification.csv | head -n1)
            echo \$BLAST_OUT >> ${cluster_id}_classification.log
            """
        }
        else {
            """
            echo "chosen classification: blast"
            export BLASTDB=
            export BLASTDB=\$BLASTDB:$taxdb
            blastn -query $consensus -db $db -task blastn -dust no -outfmt "10 sscinames staxids evalue length pident" -evalue 11 -max_hsps 50 -max_target_seqs 5 | sed 's/,/;/g' > consensus_classification.csv
            #DECIDE FINAL CLASSIFFICATION
            cat $cluster_log > ${cluster_id}_classification.log
            echo -n ";" >> ${cluster_id}_classification.log
            BLAST_OUT=\$(cut -d";" -f1,2,4,5 consensus_classification.csv | head -n1)
            echo \$BLAST_OUT >> ${cluster_id}_classification.log
            """
        }
    }
}

// Merge the per-cluster classification logs of a sample into one
// semicolon-separated table, <barcode>.nanoclust_out.txt, and (for every mode
// except 'blast') append the lineage looked up in params.tax by taxid.
//
// The lineage is appended only when the taxid is neither "0" nor empty AND the
// taxid is present in the taxonomy file. The three tests were previously
// joined with "|" (a shell pipe), so only the last test decided the branch and
// an empty taxid could match every line of the taxonomy file.
process join_results {
    publishDir "${params.outdir}/${barcode}", mode: 'copy'

    input:
    // All classification logs of one sample, grouped by barcode
    tuple val(barcode), file(logs) from classifications_ch.groupTuple()

    output:
    tuple val(barcode), file('*.nanoclust_out.txt') into output_table_ch

    script:
    // 'full': every log holds one line per classifier; the taxid is the
    // second-to-last field of each line.
    if(params.classification=='full'){
        tax=params.tax
        """
        echo "chosen classification: full"
        echo "id;reads_in_cluster;used_for_consensus;reads_after_corr;draft_id;kraken2_sciname;taxid;class_level;name;species;genus;family;order;seqmatch_sciname;taxid;class_level;name;species;genus;family;order;blast_sciname;taxid;class_level;name;species;genus;family;order;" > ${barcode}.nanoclust_out.txt
        for i in $logs; do
            while read line; do
                echo \$line
                TAXID=\$(echo \$line | awk -F';' '{print \$(NF-1)}')
                echo \$TAXID
                TAXinDB=\$(grep -w "^\${TAXID}" $tax || [[ \$? == 1 ]])
                echo \$TAXinDB
                echo -n \$(echo \$line | tr -d '\n') >> ${barcode}.nanoclust_out.txt
                if [ "\$TAXID" != "0" ] && [ "\$TAXID" != "" ] && [ "\$TAXinDB" != "" ]; then
                    echo -n ";" >> ${barcode}.nanoclust_out.txt
                    TAXONOMY=\$(grep -w "^\${TAXID}" $tax | tr -d '\t' | cut -d '|' -f2,3,4,5,6 --output-delimiter ';')
                    echo -n "\$TAXONOMY;" >> ${barcode}.nanoclust_out.txt
                else
                    echo -n ";;;;;;" >> ${barcode}.nanoclust_out.txt
                fi
            done <\$i
            echo -e "\n" >> ${barcode}.nanoclust_out.txt
        done
        sed -i 's/.\$//' ${barcode}.nanoclust_out.txt
        """
    }
    // 'blast': the logs are concatenated as they are, no lineage lookup
    else if(params.classification=='blast'){
        """
        echo "chosen classification: blast"
        echo "id;reads_in_cluster;used_for_consensus;reads_after_corr;draft_id;sciname;taxid;length;per_ident" > ${barcode}.nanoclust_out.txt

        for i in $logs; do
            cat \$i >> ${barcode}.nanoclust_out.txt
        done
        """
    }
    // 'seqmatch': the taxid is field 7 of each log
    else if(params.classification=='seqmatch'){
        tax=params.tax
        """
        echo "chosen classification: seqmatch"
        echo "id;reads_in_cluster;used_for_consensus;reads_after_corr;draft_id;sciname;taxid;seqmatch_score;name;species;genus;family;order" > ${barcode}.nanoclust_out.txt

        for i in $logs; do
            TAXID=\$(cut -d ";" -f7 \$i)
            TAXinDB=\$(grep -w "^\${TAXID}" $tax || [[ \$? == 1 ]])
            cat \$i | tr -d '\n' >> ${barcode}.nanoclust_out.txt
            if [ "\$TAXID" != "0" ] && [ "\$TAXID" != "" ] && [ "\$TAXinDB" != "" ]; then
                echo -n ";" >> ${barcode}.nanoclust_out.txt
                TAXONOMY=\$(grep -w "^\${TAXID}" $tax | tr -d '\t' | cut -d '|' -f2,3,4,5,6 --output-delimiter ';')
                echo "\$TAXONOMY" >> ${barcode}.nanoclust_out.txt
            else
                echo ";;;;" >> ${barcode}.nanoclust_out.txt
            fi
        done
        """
    }
    // 'kraken2': same layout as seqmatch, with class_level in place of the score
    else if(params.classification=='kraken2'){
        tax=params.tax
        """
        echo "chosen classification: kraken2"
        echo "id;reads_in_cluster;used_for_consensus;reads_after_corr;draft_id;sciname;taxid;class_level;name;species;genus;family;order" > ${barcode}.nanoclust_out.txt

        for i in $logs; do
            TAXID=\$(cut -d ";" -f7 \$i)
            TAXinDB=\$(grep -w "^\${TAXID}" $tax || [[ \$? == 1 ]])
            cat \$i | tr -d '\n' >> ${barcode}.nanoclust_out.txt
            if [ "\$TAXID" != "0" ] && [ "\$TAXID" != "" ] && [ "\$TAXinDB" != "" ]; then
                echo -n ";" >> ${barcode}.nanoclust_out.txt
                TAXONOMY=\$(grep -w "^\${TAXID}" $tax | tr -d '\t' | cut -d '|' -f2,3,4,5,6 --output-delimiter ';')
                echo "\$TAXONOMY" >> ${barcode}.nanoclust_out.txt
            else
                echo ";;;;" >> ${barcode}.nanoclust_out.txt
            fi
        done
        """
    }
}

// Run the get_abundance.py template on a sample's result table; every CSV it
// writes is published under the sample's output directory.
process get_abundances {
    publishDir "${params.outdir}/${barcode}", mode: 'copy'

    input:
    tuple val(barcode), file(table) from output_table_ch

    output:
    // All CSVs go to plotting one by one (mode flatten); the *_S.csv file is
    // additionally sent to the report channels.
    tuple val(barcode), file('*.csv') into abundance_table_ch mode flatten
    tuple val(barcode), file('*_S.csv') into final_counts_ch
    file('*_S.csv') into process_metadata_ch

    script:
    template "get_abundance.py"
}


// Run the plot_abundances_pool.py template on each abundance table and
// publish the resulting PNG files.
process plot_abundances {
    publishDir "${params.outdir}/${barcode}", mode: 'copy'

    input:
    tuple val(barcode), file(table) from abundance_table_ch

    output:
    file("*.png")

    script:
    template "plot_abundances_pool.py"
}

// Without --onGridIon no run metadata is parsed, so the channel is filled with
// placeholders instead. generate_reports destructures three values from it
// (kit, run_id, seq_start); only two were supplied before, which left
// seq_start without a value.
if(!params.onGridIon){
        Channel.from(params.kit, 'unknown', 'unknown').set{barcoding_kit}
    }

// Report generation; both processes are only declared when --generateReports is set.
if(params.generateReports){

    // Run process_metadata.py on the experiment sheet. It is expected to write
    // barcodes.csv and, optionally, *_control.csv files.
    process collect_metadata {
        input:
        // The collected *_S.csv tables are staged in the task directory but are
        // not named on the command line below.
        file(table) from process_metadata_ch.collect()
        file(samplesheet) from samplesheet_ch

        output:
        file("*_control.csv") into collected_metadata_ch optional true
        file("barcodes.csv") into samplsheet_csv_ch

        script:
        // info_file is assigned but not used in this script block
        info_file=params.experimentInfo

        """
        process_metadata.py --metatable ${samplesheet}
        """
        
    }

    // Build one HTML report per barcode listed in barcodes.csv.
    process generate_reports {
        publishDir "${params.outdir}/patient_reports", mode: 'copy'

        input:
        // Barcodes from barcodes.csv are joined with (barcode, table, reads_count);
        // remainder: true keeps keys that have no match on the other side.
        tuple val(barcode), file(table), val(reads_count) from samplsheet_csv_ch.splitCsv().flatten().join(final_counts_ch.join(reads_count_ch), remainder: true)
        file(controls) from collected_metadata_ch.collect()
        tuple val(kit), val(run_id), val(seq_start) from barcoding_kit.collect()

        output:
        file('*.html') into reports_ch mode flatten

        script:
        // --info is given the params.experimentInfo path as written on the
        // command line; the demultiplexer label passed to --demux is fixed text.
        // revision is assigned but not used in the command below.
        info_file=params.experimentInfo
        revision=workflow.revision
        clustering_size=params.umap_set_size
        report_template="$baseDir/assets/UoS_report_template.html"
        logo="$baseDir/assets/UoS_white_logo.txt"
        """
        echo ${barcode}
        echo ${table}
        echo ${reads_count}
        results_report.py \
            --infile ${table} \
            --output patient_report \
            --barcode ${barcode} \
            --info $info_file \
            --demux 'Guppy 6.4.6' \
            --clustering_size $clustering_size \
            --controls ${controls} \
            --reads_count ${reads_count} \
            --kit ${kit} \
            --report_template ${report_template} \
            --logo ${logo} \
            --run_id ${run_id} \
            --seq_start "${seq_start}"
        """
    }
}

// Converts the output documentation file to HTML with markdown_to_html.py and
// publishes the result under ${params.outdir}/pipeline_info (copied).
process output_documentation {
    publishDir "${params.outdir}/pipeline_info", mode: 'copy'

    input:
    // Documentation source read from ch_output_docs.
    file output_docs from ch_output_docs

    output:
    // Only published; not sent to any channel.
    file "results_description.html"

    script:
    """
    markdown_to_html.py $output_docs -o results_description.html
    """
}

/*
 * Completion e-mail notification
 *
 * Runs once when the workflow finishes, whether it succeeded or not, and:
 *   1. mails the files found in <outdir>/patient_reports when both
 *      params.generateReports and params.email are set,
 *   2. renders the summary e-mail from the templates under assets/ and sends it
 *      (sendmail first, plain `mail` as fallback),
 *   3. writes the rendered summary to <outdir>/pipeline_info/,
 *   4. logs the final status, including any collected racon warnings.
 */
workflow.onComplete {

    if (params.generateReports && params.email) {
        def msg1 = """\
            Pipeline execution summary
            ---------------------------
            Completed at: ${workflow.complete}
            Duration    : ${workflow.duration}
            Success     : ${workflow.success}
            workDir     : ${workflow.workDir}
            exit status : ${workflow.exitStatus}
            """
            .stripIndent()

        def currentdir = new File("$params.outdir/patient_reports")
        // The directory only exists once generate_reports has published something.
        // eachFile() throws on a missing directory, which would abort this whole
        // handler (no summary e-mail, no pipeline_info files), so guard it.
        if (currentdir.isDirectory()) {
            def files = []
            currentdir.eachFile(FileType.FILES) {
                files << it.path
            }

            sendMail(to: params.email, subject: "Patient Reports", body: msg1, attach: files)
        } else {
            log.warn "[nf-core/nanoclust] Patient report directory '${currentdir}' not found, skipping patient reports e-mail"
        }
    }

    // Set up the e-mail variables
    def subject = "[nf-core/nanoclust] Successful: $workflow.runName"
    if (!workflow.success) {
      subject = "[nf-core/nanoclust] FAILED: $workflow.runName"
    }
    def email_fields = [:]
    email_fields['version'] = workflow.manifest.version
    email_fields['runName'] = custom_runName ?: workflow.runName
    email_fields['success'] = workflow.success
    email_fields['dateComplete'] = workflow.complete
    email_fields['duration'] = workflow.duration
    email_fields['exitStatus'] = workflow.exitStatus
    email_fields['errorMessage'] = (workflow.errorMessage ?: 'None')
    email_fields['errorReport'] = (workflow.errorReport ?: 'None')
    email_fields['commandLine'] = workflow.commandLine
    email_fields['projectDir'] = workflow.projectDir
    // Note: this stores a reference, so the entries added below also land in `summary`.
    email_fields['summary'] = summary
    email_fields['summary']['Date Started'] = workflow.start
    email_fields['summary']['Date Completed'] = workflow.complete
    email_fields['summary']['Pipeline script file path'] = workflow.scriptFile
    email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId
    if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository
    if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId
    if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision
    if (workflow.container) email_fields['summary']['Docker image'] = workflow.container
    email_fields['summary']['Nextflow Version'] = workflow.nextflow.version
    email_fields['summary']['Nextflow Build'] = workflow.nextflow.build
    email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp

    // TODO nf-core: If not using MultiQC, strip out this code (including params.maxMultiqcEmailFileSize)
    // On success try attach the multiqc report
    def mqc_report = null
    try {
        if (workflow.success) {
            mqc_report = multiqc_report.getVal()
            if (mqc_report.getClass() == ArrayList) {
                log.warn "[nf-core/nanoclust] Found multiple reports from process 'multiqc', will use only one"
                mqc_report = mqc_report[0]
            }
        }
    } catch (all) {
        // Best effort: the summary e-mail is still sent without the attachment.
        log.warn "[nf-core/nanoclust] Could not attach MultiQC report to summary email"
    }

    // Check if we are only sending emails on failure
    email_address = params.email
    if (params.email_on_fail && !workflow.success) {
        email_address = params.email_on_fail
    } 

    // With email_on_fail set, successful runs send no summary e-mail at all.
    if (params.email_on_fail && workflow.success) {
        email_address = false
    }

    // Render the TXT template
    def engine = new groovy.text.GStringTemplateEngine()
    def tf = new File("$baseDir/assets/email_template.txt")
    def txt_template = engine.createTemplate(tf).make(email_fields)
    def email_txt = txt_template.toString()

    // Render the HTML template
    def hf = new File("$baseDir/assets/email_template.html")
    def html_template = engine.createTemplate(hf).make(email_fields)
    def email_html = html_template.toString()

    // Render the sendmail template
    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.maxMultiqcEmailFileSize.toBytes() ]
    def sf = new File("$baseDir/assets/sendmail_template.txt")
    def sendmail_template = engine.createTemplate(sf).make(smail_fields)
    def sendmail_html = sendmail_template.toString()

    // Send the HTML e-mail
    if (email_address) {
        try {
            // Deliberately jump to the plaintext fallback in the catch block below.
            if ( params.plaintext_email ){ throw new GroovyException('Send plaintext e-mail, not HTML') }
            // Try to send HTML e-mail using sendmail
            [ 'sendmail', '-t' ].execute() << sendmail_html
            log.info "[nf-core/nanoclust] Sent summary e-mail to $email_address (sendmail)"
        } catch (all) {
            // Catch failures and try with plaintext
            [ 'mail', '-s', subject, email_address ].execute() << email_txt
            log.info "[nf-core/nanoclust] Sent summary e-mail to $email_address (mail)"
        }
    }

    // Write summary e-mail HTML to a file
    def output_d = new File( "${params.outdir}/pipeline_info/" )
    if (!output_d.exists()) {
      output_d.mkdirs()
    }
    def output_hf = new File( output_d, "pipeline_report.html" )
    output_hf.withWriter { w -> w << email_html }
    def output_tf = new File( output_d, "pipeline_report.txt" )
    output_tf.withWriter { w -> w << email_txt }

    // ANSI colour codes for the closing log lines; empty when monochrome logs are requested
    c_reset = params.monochrome_logs ? '' : "\033[0m";
    c_purple = params.monochrome_logs ? '' : "\033[0;35m";
    c_green = params.monochrome_logs ? '' : "\033[0;32m";
    c_red = params.monochrome_logs ? '' : "\033[0;31m";

    if (workflow.stats.ignoredCount > 0 && workflow.success) {
        log.info "${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}"
        log.info "${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}"
        log.info "${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}"
    }

    if (workflow.success) {
        log.info "${c_purple}[nf-core/nanoclust]${c_green} Pipeline completed successfully${c_reset}"
        if(!racon_warnings.isEmpty()){
            racon_warnings.each{log.warn "$it"}
        }
    } else {
        checkHostname()
        log.info "${c_purple}[nf-core/nanoclust]${c_red} Pipeline completed with errors${c_reset}"
    }

}


/*
 * Builds the NanoCLUST start-up banner, including the pipeline version taken
 * from workflow.manifest.version. Colours are dropped when
 * params.monochrome_logs is set.
 */
def nfcoreHeader(){
    // Maps an ANSI escape sequence to '' when monochrome logging is requested.
    def tint = { String code -> params.monochrome_logs ? '' : code }

    // Log colors ANSI codes. These are intentionally left undeclared (no `def`),
    // exactly as before, so they are still assigned as script-level variables.
    c_reset = tint("\033[0m")
    c_dim = tint("\033[2m")
    c_black = tint("\033[0;30m")
    c_green = tint("\033[0;32m")
    c_yellow = tint("\033[0;33m")
    c_blue = tint("\033[0;34m")
    c_purple = tint("\033[0;35m")
    c_cyan = tint("\033[0;36m")
    c_white = tint("\033[0;37m")
    c_red = tint("\033[0;31m")

    def banner = """   
    -${c_dim}--------------------------------------------------${c_reset}-
    ${c_green}      _   __                 ${c_red}    ________    __  _____________${c_reset}
    ${c_green}     / | / /___ _____  ____  ${c_red}   / ____/ /   / / / / ___/_  __/${c_reset}
    ${c_green}    /  |/ / __ `/ __ \\/ __ \\ ${c_red}  / /   / /   / / / /\\__ \\ / /   ${c_reset}
    ${c_green}   / /|  / /_/ / / / / /_/ / ${c_red} / /___/ /___/ /_/ /___/ // /    ${c_reset}
    ${c_green}  /_/ |_/\\__,_/_/ /_/\\____/  ${c_red} \\____/_____/\\____//____//_/     ${c_reset}

    ${c_purple}  NanoCLUST v${workflow.manifest.version}${c_reset}
    -${c_dim}--------------------------------------------------${c_reset}-
    """

    // Remove the source-code indentation from the banner lines.
    return banner.stripIndent()
}

/*
 * Warns when the machine's hostname matches an entry in params.hostnames whose
 * profile is not part of the active `-profile` string. params.hostnames is
 * iterated as profile name -> collection of hostname fragments.
 */
def checkHostname(){
    // No profile/hostname mapping configured: nothing to verify.
    if (!params.hostnames) {
        return
    }

    // ANSI escape sequences, blanked out when monochrome logging is requested.
    def ansi = { String code -> params.monochrome_logs ? '' : code }
    def reset = ansi("\033[0m")
    def white = ansi("\033[0;37m")
    def red = ansi("\033[1;91m")
    def yellowBold = ansi("\033[1;93m")

    // Ask the operating system for the current hostname.
    def currentHost = "hostname".execute().text.trim()

    params.hostnames.each { profileName, hostFragments ->
        hostFragments.each { fragment ->
            // Skip entries that do not match this host, or whose profile is already in use.
            if (!currentHost.contains(fragment) || workflow.profile.contains(profileName)) {
                return
            }
            def notice = "====================================================\n" +
                    "  ${red}WARNING!${reset} You are running with `-profile $workflow.profile`\n" +
                    "  but your machine hostname is ${white}'$currentHost'${reset}\n" +
                    "  ${yellowBold}It's highly recommended that you use `-profile $profileName${reset}`\n" +
                    "============================================================"
            log.error notice
        }
    }
}