diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5743a60..56cf4020 100755
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -58,6 +58,7 @@ Code contributions to the hotfix:
 - [Jaime Ozaez](https://github.com/jaimeozaez)
 - [Sara Monzón](https://github.com/saramonzon)
 - [Sarai Varona](https://github.com/svarona)
+- [Daniel Valle](https://github.com/Daniel-VM)
 
 ### Template fixes and updates
 
@@ -67,6 +68,7 @@ Code contributions to the hotfix:
 - Small changes to `buisciii_tools/bu_isciii/templates/viralrecon/RESULTS/viralrecon_results` for blast and new excel_generator.py
 - Introduced better error handling in excel_generator.py. Now it can also be used for single files
 - Brought back `PASS_ONLY` to exometrio's `exomiser_configfile.yml`
+- [#187](https://github.com/BU-ISCIII/buisciii-tools/pull/187) - Added new template for bacterial assembly, allowing for short, long and hybrid assembly.
 
 ### Modules
 
diff --git a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
index 9503c3f9..bd8f8549 100644
--- a/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
+++ b/bu_isciii/templates/assembly/ANALYSIS/ANALYSIS01_ASSEMBLY/lablog
@@ -1,31 +1,105 @@
-echo "Do you want to save trimmed reads in outdir?"
+# Print a message, optionally wrapped in an ANSI bold color.
+#   $1 - text to print
+#   $2 - color name: "red", "green" or "blue"; anything else prints plain text
+print_color() {
+    case "$2" in
+        "red")
+            echo -e "\e[1;31m$1\e[0m"
+            ;;
+        "green")
+            echo -e "\e[1;32m$1\e[0m"
+            ;;
+        "blue")
+            echo -e "\e[1;34m$1\e[0m"
+            ;;
+        *)
+            echo "$1"
+            ;;
+    esac
+}
 
-read -p 'Write y or n: ' trimmed
+# Prompt for input with a blue label.
+#   $1 - label, printed in blue
+#   $2 - text appended after the label
+# The answer is stored in the global variable "response".
+prompt_with_color() {
+    # Quote "$1": unquoted, a multi-word label is word-split before print_color sees it.
+    read -r -p "$(print_color "$1" 'blue') $2" response
+}
 
-TRIMMED=$(echo "${trimmed}" | tr '[:upper:]' '[:lower:]')
+# Select assembly mode
+assembly_options=("short" "long" "hybrid")
+print_color "Indicate the preferred assembly mode:" 'blue'
+select ASSEMBLY_MODE in "${assembly_options[@]}"; do
+    if [ -n "$ASSEMBLY_MODE" ]; then
+        if [ "$ASSEMBLY_MODE" == "short" ]; then
+            ASSEMBLER="unicycler"
+        elif [ "$ASSEMBLY_MODE" == "long" ] || [ "$ASSEMBLY_MODE" == "hybrid" ]; then
+            ASSEMBLER="dragonflye"
+        fi
+        break
+    else
+        print_color "Invalid input. Please select a valid option." 'red'
+    fi
+done
+print_color "Selected assembly mode: $ASSEMBLY_MODE" 'green'
 
-if [ "$TRIMMED" == "yes" ] || [ "$TRIMMED" == "y" ]
-then SAVETRIMMED="True"
-else SAVETRIMMED="False"
-fi
+# Select whether to save trimmed reads
+trim_options=("Yes" "No")
+print_color "Do you want to save trimmed reads in outdir?" 'blue'
+select TRIMMED in "${trim_options[@]}"; do
+    if [ -n "$TRIMMED" ]; then
+        # Map the menu choice onto the boolean passed to --save_trimmed
+        if [ "$TRIMMED" == "Yes" ]; then
+            SAVETRIMMED="true"
+        else
+            SAVETRIMMED="false"
+        fi
 
-echo "Is gram positive or negative?"
+        break
+    else
+        print_color "Invalid input. Please select a valid option." 'red'
+    fi
+done
+print_color "Selected trimmed file option: $TRIMMED save trimmed" 'green'
 
-read -p 'Write + or -: ' grammtype
+# Select Prokka gram type
+gram_options=("+" "-" "skip")
+# Start empty so that "skip" adds nothing to the nextflow command line
+PROKKA_ARGS=""
 
-if [ "$grammtype" != "-" ] && [ "$grammtype" != "+" ]
-then
-    echo "The given param: $grammtype does not match any of the accepted params ('+' or '-')"
-    exit 1
-fi
+print_color "Is gram positive or negative?" 'blue'
+select GRAMTYPE in "${gram_options[@]}"; do
+    if [ -n "$GRAMTYPE" ]; then
+        if [ "$GRAMTYPE" != "skip" ]; then
+            PROKKA_ARGS="--prokka_args '--gram ${GRAMTYPE}'"
+        fi
+        break
+    else
+        print_color "Invalid input. Please select a valid option." 'red'
+    fi
+done
+print_color "Selected Prokka gram type: $GRAMTYPE" 'green'
 
+
+
+# SETUP INPUT SAMPLE SHEET
 ln -s ../00-reads .
 ln -s ../samples_id.txt .
 
-echo "sample,fastq_1,fastq_2" > samplesheet.csv
-cat samples_id.txt | while read in; do echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz"; done >> samplesheet.csv
+echo "ID,R1,R2,LongFastQ,Fast5,GenomeSize" > samplesheet.csv
+while IFS= read -r in; do
+    if [ "$ASSEMBLY_MODE" == "short" ]; then
+        echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz,NA,NA,NA";
+    elif [ "$ASSEMBLY_MODE" == "long" ]; then
+        echo "${in},NA,NA,00-reads/${in}.fastq.gz,NA,NA";
+    elif [ "$ASSEMBLY_MODE" == "hybrid" ]; then
+        echo "${in},00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz,00-reads/${in}.fastq.gz,NA,NA";
+    else
+        echo "Format not recognized for the sample : ${in}." >&2;
+    fi
+done < samples_id.txt >> samplesheet.csv
 
-#module load Nextflow/21.10.6 singularity
 
 scratch_dir=$(echo $PWD | sed "s/\/data\/bi\/scratch_tmp/\/scratch/g")
 cat <<EOF > assembly.sbatch
@@ -38,20 +112,28 @@ cat <<EOF > assembly.sbatch
 #SBATCH --output $(date '+%Y%m%d')_assembly01.log
 #SBATCH --chdir $scratch_dir
 
-export NXF_OPTS="-Xms500M -Xmx4G"
-
-nextflow run /scratch/bi/pipelines/BU_ISCIII-bacterial-assembly/main.nf \\
-    -c ../../DOC/hpc_slurm_assembly.config \\
-    --input samplesheet.csv \\
-    --outdir ./ \\
-    --cut_mean_quality 20 \\
-    --qualified_quality_phred 20 \\
-    --gram ${grammtype} \\
-    --reference_outdir ../../REFERENCES \\
-    --save_trimmed ${SAVETRIMMED} \\
-    --kmerfinder_bacteria_database '/data/bi/references/kmerfinder/20190108_stable_dirs/bacteria' \\
-    --reference_ncbi_bacteria '/data/bi/references/bacteria/latest_db/assembly_summary_bacteria.txt' \\
-    -resume
+# module load Nextflow/23.10.0 singularity
+export NXF_OPTS="-Xms500M -Xmx8G"
+
+nextflow run /data/bi/pipelines/nf-core-bacass/main.nf \\
+    -c ../../DOC/hpc_slurm_assembly.config \\
+    -profile singularity \\
+    --input samplesheet.csv \\
+    --outdir ./ \\
+    --assembly_type ${ASSEMBLY_MODE} \\
+    --assembler ${ASSEMBLER} \\
+    --skip_polish true \\
+    --save_trimmed ${SAVETRIMMED} \\
+    --fastp_args '--qualified_quality_phred 20 --cut_mean_quality 20' \\
+    --skip_kraken2 true \\
+    --skip_kmerfinder false \\
+    --kmerfinderdb /data/bi/references/kmerfinder/20190108_stable_dirs/bacteria \\
+    --ncbi_assembly_metadata /data/bi/references/bacteria/20191212/assembly_summary_bacteria.txt \\
+    ${PROKKA_ARGS} \\
+    -resume
+
 EOF
 
 echo "sbatch assembly.sbatch" > _01_nf_assembly.sh
+
+
diff --git a/bu_isciii/templates/assembly/ANALYSIS/lablog_assembly b/bu_isciii/templates/assembly/ANALYSIS/lablog_assembly
index bcbdefd6..c5e90e0e 100644
--- a/bu_isciii/templates/assembly/ANALYSIS/lablog_assembly
+++ b/bu_isciii/templates/assembly/ANALYSIS/lablog_assembly
@@ -1,4 +1,21 @@
 mkdir -p 00-reads
-cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd -
-cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd -
-mv ANALYSIS01_ASSEMBLY $(date '+%Y%m%d')_ANALYSIS01_ASSEMBLY
+cd 00-reads || exit 1
+
+# Loop through each sample listed in samples_id.txt
+while IFS= read -r sample; do
+    # Names (without the .fastq.gz extension) of every RAW file starting with the sample id
+    filename_noext=$(basename -s .fastq.gz ../../RAW/"${sample}"*)
+
+    ### Check if the file is a short read or long read
+    for fileitem in $filename_noext; do
+        if [[ "$fileitem" =~ _R[12] ]]; then
+            ln -s -f ../../RAW/"${sample}"*_R1*.fastq.gz "${sample}_R1.fastq.gz"
+            ln -s -f ../../RAW/"${sample}"*_R2*.fastq.gz "${sample}_R2.fastq.gz"
+        else
+            ln -s -f "../../RAW/${sample}.fastq.gz" "${sample}.fastq.gz"
+        fi
+    done
+done < ../samples_id.txt
+
+cd -
+mv ANALYSIS01_ASSEMBLY "$(date '+%Y%m%d')_ANALYSIS01_ASSEMBLY"
diff --git a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
index 73bfc79b..04dddf4d 100644
--- a/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
+++ b/bu_isciii/templates/assembly/DOC/hpc_slurm_assembly.config
@@ -1,26 +1,232 @@
-conda {
-    enabled = true
-    autoMounts = true
-}
+/*
+    HPC XTUTATIS CONFIGURATION
+*/
 
 singularity {
-    enabled = true
-    autoMounts = true
+    enabled = true
+    autoMounts = true
+    cacheDir = '/data/bi/pipelines/singularity-images'
 }
 
 process {
-    executor = 'slurm'
-    queue = 'middle_idx'
-    conda = '/data/bi/pipelines/miniconda3/envs/assembly'
-    errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : 'finish'; task.exitStatus in [1,4,255] ? 'ignore' : 'finish' }
-    maxRetries = 1
+    executor = 'slurm'
+    queue = 'middle_idx'
+    jobName = { "$task.name - $task.hash" }
+    conda = null
+
+    // A closure returns only its last expression, so both exit-code checks are chained into one
+    errorStrategy = { task.exitStatus in [140,143,137,138,104,134,139] ? 'retry' : (task.exitStatus in [1,4,255] ? 'ignore' : 'finish') }
+    maxRetries = 1
+    maxErrors = '-1'
+
+    withName:PROKKA {
+        container = 'https://zenodo.org/records/10496286/files/bioconda_prokka_v1.14.6_signalp_v4.1.sif?download=1'
+        errorStrategy = { task.exitStatus in [2] ? 'retry' : 'finish'}
+        maxRetries = 2
     maxErrors = '-1'
-    withName:KMERFINDER {
-        container = '/scratch/bi/singularity-images/kmerfinder_v3.0.2.sif'
-    }
+    }
 }
+
 params {
     max_memory = 376.GB
     max_cpus = 32
     max_time = '48.h'
 }
+
+/*
+    CUSTOM OUTPUT FOLDER STRUCTURE -- modules.config
+*/
+params { publish_dir_mode = 'copy' }
+process {
+    withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_RAW' {
+        publishDir = [
+            [
+                path: { "${params.outdir}/01-processing/fastqc/raw" },
+                pattern: "*.{json,html}",
+                mode: params.publish_dir_mode
+            ],
+            [
+                path: { "${params.outdir}/01-processing/fastqc/raw/zips" },
+                pattern: "*.zip",
+                mode: params.publish_dir_mode
+            ]
+        ]
+    }
+    withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTP' {
+        publishDir = [
+            [
+                path: { "${params.outdir}/01-processing/fastp" },
+                mode: params.publish_dir_mode,
+                enabled: params.save_trimmed
+            ],
+            [
+                path: { "${params.outdir}/01-processing/fastp" },
+                mode: params.publish_dir_mode,
+                pattern: "*.{json,html}"
+            ],
+            [
+                path: { "${params.outdir}/01-processing/fastp/logs" },
+                mode: params.publish_dir_mode,
+                pattern: "*.log"
+            ]
+        ]
+    }
+    withName: '.*:.*:FASTQ_TRIM_FASTP_FASTQC:FASTQC_TRIM' {
+        publishDir = [
+            [
+                path: { "${params.outdir}/01-processing/fastqc/trim" },
+                pattern: "*.{json,html}",
+                mode: params.publish_dir_mode
+            ],
+            [
+                path: { "${params.outdir}/01-processing/fastqc/trim/zips" },
+                pattern: "*.zip",
+                mode: params.publish_dir_mode
+            ]
+        ]
+    }
+    withName: 'NANOPLOT' {
+        publishDir = [
+            path: { "${params.outdir}/01-processing/nanoplot" },
+            pattern: "*.txt",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: 'PYCOQC' {
+        publishDir = [
+            path: { "${params.outdir}/01-processing/pycoqc" },
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: 'PORECHOP_PORECHOP' {
+        publishDir = [
+            [
+                path: { "${params.outdir}/01-processing/porechop" },
+                pattern: "*.fastq.gz",
+                mode: params.publish_dir_mode,
+                enabled: params.save_trimmed
+            ],
+            [
+                path: { "${params.outdir}/01-processing/porechop/logs" },
+                pattern: "*.log",
+                mode: params.publish_dir_mode,
+            ]
+        ]
+    }
+    withName: '.*:.*:KMERFINDER_SUBWORKFLOW:FIND_DOWNLOAD_REFERENCE' {
+        publishDir = [
+            path: { "${params.outdir}/../../REFERENCES" },
+            pattern: "*.{fna,gff}.gz",
+            mode: params.publish_dir_mode,
+            saveAs: { filename ->
+                if (filename.equals('versions.yml')){
+                    null
+                } else {
+                    "${refmeta.toString().replace(' ', '_')}/${filename}"
+                }
+            }
+        ]
+    }
+    withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER' {
+        publishDir = [
+            path: { "${params.outdir}/02-taxonomy_contamination/kmerfinder/${meta.id}" },
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: '.*:.*:KMERFINDER_SUBWORKFLOW:KMERFINDER_SUMMARY' {
+        publishDir = [
+            path: { "${params.outdir}/99-stats" },
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: 'KRAKEN2|KRAKEN2_LONG' {
+        publishDir = [
+            path: { "${params.outdir}/02-taxonomy_contamination/kraken2" },
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: 'UNICYCLER|CANU|MINIASM|DRAGONFLYE' {
+        publishDir = [
+            path: { "${params.outdir}/03-assembly/${params.assembler}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename ->
+                if (filename.endsWith('.scaffolds.fa.gz') ||
+                    filename.endsWith('.contigs.fasta.gz') ||
+                    filename.endsWith('.contigs.fa') ||
+                    filename.endsWith('.fasta.gz')) {
+                    "${meta.id}.fasta.gz"
+                } else {
+                    null
+                }
+            }
+        ]
+    }
+    withName: 'RACON|MEDAKA|NANOPOLISH' {
+        publishDir = [
+            path: { "${params.outdir}/03-assembly/${params.assembler}/${params.polish_method}" },
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: 'QUAST|QUAST_BYREFSEQID' {
+        publishDir = [
+            path: { "${params.outdir}/03-assembly/quast" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename ->
+                if (filename.equals('versions.yml') || filename.endsWith('.tsv')){
+                    null
+                } else if (filename.startsWith('GCF')){
+                    "per_reference_reports/${filename}"
+                }
+                else if (!filename.startsWith('GCF')) {
+                    "global_${filename}"
+                }
+            }
+        ]
+    }
+    withName: 'PROKKA' {
+        ext.args = {
+            [
+                '--force',
+                params.prokka_args ? "${params.prokka_args}" : ''
+            ].join(' ').trim()
+        }
+        publishDir = [
+            path: { "${params.outdir}/05-annotation/prokka" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+    withName: 'BAKTA_BAKTA' {
+        ext.args = {
+            [
+                '--force',
+                params.bakta_args ? "${params.bakta_args}" : ''
+            ].join(' ').trim()
+        }
+        publishDir = [
+            path: { "${params.outdir}/05-annotation/bakta/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+    withName: 'MULTIQC' {
+        publishDir = [
+            [
+                path: { "${params.outdir}/99-stats/multiqc" },
+                mode: params.publish_dir_mode,
+                saveAs: { filename ->
+                    if (filename.equals('versions.yml') || filename.endsWith('.csv')) {
+                        null
+                    } else {
+                        filename
+                    }
+                }
+            ],
+            [
+                path: { "${params.outdir}/99-stats" },
+                mode: params.publish_dir_mode,
+                pattern: "*.csv"
+            ]
+        ]
+    }
+}
diff --git a/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results b/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results
index f112c0b5..508d1d55 100644
--- a/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results
+++ b/bu_isciii/templates/assembly/RESULTS/lablog_assembly_results
@@ -1,12 +1,32 @@
 DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega"
 
 mkdir $DELIVERY_FOLDER
-mkdir "${DELIVERY_FOLDER}/assembly"
+mkdir $DELIVERY_FOLDER/assembly
 
 # Assembly service
 cd $DELIVERY_FOLDER/assembly
-ln -s ../../../ANALYSIS/*ASSEMBLY/99-stats/MultiQC/multiqc_report.html .
-ln -s ../../../ANALYSIS/*ASSEMBLY/99-stats/kmerfinder.csv .
-ln -s ../../../ANALYSIS/*ASSEMBLY/03-assembly/unicycler assemblies
-ln -s ../../../ANALYSIS/*ASSEMBLY/03-assembly/quast_results/latest*/report.html quast_report.html
+
+# Links to reports
+ln -s ../../../ANALYSIS/*ASSEMBLY/99-stats/multiqc/multiqc_report.html .
+ln -s ../../../ANALYSIS/*ASSEMBLY/99-stats/summary_assembly_metrics_mqc.csv .
+ln -s ../../../ANALYSIS/*ASSEMBLY/99-stats/kmerfinder_summary.csv .
+ln -s ../../../ANALYSIS/*ASSEMBLY/03-assembly/quast/global_report/report.html quast_global_report.html
+
+# Links to per reference reports
+for dir in ../../../ANALYSIS/*ASSEMBLY/03-assembly/quast/per_reference_reports/*; do
+    base=$(basename "$dir")
+    if compgen -G "$dir" > /dev/null; then
+        ln -s "$dir/report.html" "quast_${base}_report.html"
+    fi
+done
+
+# Link the output directory of the first assembler found as "assemblies"
+assembly_dirs=(unicycler dragonflye canu miniasm)
+for tool in "${assembly_dirs[@]}"; do
+    path="../../../ANALYSIS/*ASSEMBLY/03-assembly/${tool}"
+    if compgen -G "$path" > /dev/null; then
+        find $path -maxdepth 0 -type d -exec ln -nsf {} assemblies \;
+        break
+    fi
+done
 cd -
diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json
index 2d8ac4e1..255531b9 100755
--- a/bu_isciii/templates/services.json
+++ b/bu_isciii/templates/services.json
@@ -6,7 +6,7 @@
         "order": 1,
         "begin": "",
         "end": "",
-        "description": "Nextflow assembly pipeline to assemble bacterial genomes",
+        "description": "nf-core/bacass: Simple bacterial assembly and annotation pipeline",
         "clean": {
             "folders":["01-preprocessing/trimmed_sequences"],
             "files":[]