#!/usr/bin/env nextflow

// main.nf

/*
=============================================================
 This section stages the input files:
 1. If params.input is a CSV file, helper functions read the
    paths to the paired FASTQ files from it. Alternatively,
    params.input can be a path to the paired FASTQ files.
    Once staged in ch_input, the reads are placed in 2
    channels - one for QC, one for (pseudo)alignment.
 
 2. The FASTA and GTF file are staged for the analysis using
    Channel.value(file()) so they can be used multiple times. 
=============================================================
*/

// Stage the reads: an input ending in ".csv" is parsed by extract_data();
// anything else is handed to fromFilePairs unchanged.
if( !has_extension(params.input, ".csv") ){

   ch_input = Channel.fromFilePairs(params.input, checkIfExists: true)

}else{

   csv_file = file(params.input, checkIfExists: true)
   ch_input = extract_data(csv_file)

}

// Split the read channel in two: one copy for QC, one for quantification.
def read_copies = ch_input.into(2)
ch_qc_reads  = read_copies[0]
ch_raw_reads = read_copies[1]

// Reference files are wrapped in value channels.
ch_fasta = Channel.value( file(params.fasta) )
ch_gtf   = Channel.value( file(params.gtf) )


/*
=================================================================
  1. FASTQC
     The first process we will create is FASTQC to check the 
     quality of our sequencing reads. In our simulated data there
     are no contaminants - subsequent adapter trimming is skipped
     Inputs:
     The raw sequencing reads
     Outputs:
     HTML, ZIP files generated by FastQC
=================================================================
*/

process FASTQC{
    // Label each task with its sample identifier.
    tag "${sample_id}"

    // Reports are copied under <outdir>/fastqc when params.save_qc_intermediates
    // is set; otherwise saveAs yields null.
    publishDir params.outdir,
        mode: 'copy',
        saveAs: { report -> params.save_qc_intermediates ? "fastqc/${report}" : null }

    input:
    tuple val(sample_id), file(fastqs) from ch_qc_reads

    output:
    file("*.{html,zip}") into ch_multiqc

    // Only executed when QC has been requested.
    when:
    params.run_qc

    script:
    """
    fastqc -q ${fastqs}
    """
}

/*
================================================================
 2. Generate a Transcriptome file. 
    We are performing pseudoalignment, which is done against the
    transcriptome - not the genome. As such we will subset our 
    ChrI.fa file to contain only sequences that originate from 
    the transcriptome.
    Inputs:
    The genome FASTA file
    The reference GTF file
    Outputs:
    A newly created transcriptome FASTA file
================================================================
*/

process TX{
    // The generated FASTA is copied under <outdir>/reference/transcriptome when
    // params.save_transcriptome is set; otherwise saveAs yields null.
    publishDir params.outdir,
        mode: 'copy',
        saveAs: { tx_file -> params.save_transcriptome ? "reference/transcriptome/${tx_file}" : null }

    input:
    file(genome) from ch_fasta
    file(annotation) from ch_gtf

    output:
    file("${genome.baseName}.tx.fa") into transcriptome_created

    // Runs only when no transcriptome was supplied and a genome FASTA is available.
    when:
    !params.transcriptome && params.fasta

    script:
    """
    gffread -F -w "${genome.baseName}.tx.fa" -g ${genome} ${annotation}
    """
}

/*
=================================================================
 3. Create Transcriptome Index
    As with any alignment, we will need to create an index file 
    of the input transcriptome/genome file for fast access during 
    alignment. 
    Inputs:
    The transcriptome FASTA file generated in the process TX
    Outputs:
    An index file
=================================================================
*/

// Choose the transcriptome to index. A user-supplied params.transcriptome takes
// precedence (process TX is skipped by its own `when:` guard in that case, so
// transcriptome_created would never emit); otherwise use the FASTA emitted by TX.
ch_index_ref = params.transcriptome ? Channel.value(file(params.transcriptome, checkIfExists: true)) : transcriptome_created

process INDEX{

    // The index file is copied to <outdir>/index.
    publishDir "${params.outdir}/index", mode: 'copy'

    input:
    file(ref) from ch_index_ref

    output:
    file "chr1_index.idx" into index_created

    script:
    """
    kallisto index -i chr1_index.idx ${ref}
    """
}

process KALLISTO_QUANT{
    // Each sample's output directory and log are copied to <outdir>/kallisto_quant.
    publishDir "${params.outdir}/kallisto_quant", mode: 'copy'

    input:
    file(idx) from index_created
    tuple val(sample_id), file(fastqs) from ch_raw_reads

    output:
    tuple val(sample_id), file("${sample_id}") into kallisto_out
    file("${sample_id}.kallisto.log") into kallisto_logs

    script:
    // Both stdout and stderr of kallisto are redirected into the per-sample log.
    """
    kallisto quant \
        -i ${idx} \
        -t 2 \
        -o ${sample_id}/ \
        --bias \
        --pseudobam \
        ${fastqs} &> ${sample_id}.kallisto.log
    """
}

process MULTIQC{
    // The HTML report is copied to <outdir>/quality_control/multiqc.
    publishDir "${params.outdir}/quality_control/multiqc", mode: 'copy'

    input:
    // All FastQC outputs and all kallisto logs are gathered into a single task.
    file(fastqc_reports) from ch_multiqc.collect()
    file(quant_logs) from kallisto_logs.collect()

    output:
    file("*.html") into ch_out

    // Only executed when QC has been requested.
    when:
    params.run_qc

    script:
    """
    multiqc .
    """
}

/*
================================================================================
                            AUXILIARY FUNCTIONS
================================================================================
*/

// Verify that a parsed CSV row holds exactly `number` entries.
// Calls `exit 1` with an explanatory message when the size differs;
// returns true otherwise.
def checkNumberOfItem(row, number) {
    def well_formed = (row.size() == number)
    if (!well_formed) {
        exit 1, "error:  Invalid CSV input - malformed row (e.g. missing column) in ${row}, consult documentation."
    }
    return true
}

// Resolve `it` with file() and return the result.
// Calls `exit 1` with an explanatory message when the resolved file does not exist.
def return_file(it) {
    def resolved = file(it)
    if (!resolved.exists()) {
        exit 1, "error: Cannot find supplied FASTQ input file. Check file: ${it}"
    }
    return resolved
}

// Case-insensitive suffix test: true when the string form of `it`
// ends with `extension`.
def has_extension(it, extension) {
    def name_lc = it.toString().toLowerCase()
    def suffix_lc = extension.toLowerCase()
    return name_lc.endsWith(suffix_lc)
}

// Parse the samples CSV into a channel of [ sample_id, [read1, read2] ] tuples,
// mimicking the shape emitted by Channel.fromFilePairs.
//
// csvFile: comma-separated file with a header row containing the columns
//          'Sample_ID', 'Read1' and 'Read2'.
//
// Each row is validated; the run is aborted via `exit 1` when:
//   - the expected column names are missing,
//   - the row does not have exactly three entries,
//   - any of the three fields is blank,
//   - a read file does not exist (see return_file),
//   - a read does not end in one of the recognised FASTQ suffixes.
def extract_data(csvFile){
    // Suffixes accepted for FASTQ inputs; matched case-insensitively by has_extension.
    def fastq_suffixes = ["fastq.gz", "fq.gz", "fastq", "fq"]

    Channel
        .fromPath(csvFile)
        .splitCsv(header: true, sep: ',')
        .map{ row ->

        def expected_keys = ["Sample_ID", "Read1", "Read2"]
        if(!row.keySet().containsAll(expected_keys)) exit 1, "error: Invalid CSV input - malformed column names. Please use the column names 'Sample_ID', 'Read1', 'Read2'."

        checkNumberOfItem(row, 3)

        // Reject blank (or missing) fields up front, before any path is resolved,
        // so the user gets this message rather than a file-lookup failure.
        if( !row.Sample_ID || !row.Read1 || !row.Read2 ) exit 1, "error: a field does not contain any information. Please check your CSV file"

        def samples = row.Sample_ID
        // A literal 'NA' is kept as a string; anything else must be an existing file.
        def read1 = row.Read1.matches('NA') ? 'NA' : return_file(row.Read1)
        def read2 = row.Read2.matches('NA') ? 'NA' : return_file(row.Read2)

        // The messages interpolate read1/read2, the variables actually defined here.
        if( !fastq_suffixes.any{ suffix -> has_extension(read1, suffix) } ) exit 1, "error: A R1 file has a non-recognizable FASTQ extension. Check: ${read1}"
        if( !fastq_suffixes.any{ suffix -> has_extension(read2, suffix) } ) exit 1, "error: A R2 file has a non-recognizable FASTQ extension. Check: ${read2}"

        // output tuple mimicking fromFilePairs
        [ samples, [read1, read2] ]

        }
}

// Select the transcriptome source: a user-supplied file wrapped in a value
// channel, or the output channel of process TX.
// NOTE(review): no process visible in this file reads ch_transcriptome - confirm
// whether it is consumed elsewhere.
if( params.transcriptome ){
    ch_transcriptome = Channel.value(file(params.transcriptome))
}else{
    ch_transcriptome = transcriptome_created
}