diff --git a/modules/nf-core/samtools/bgzip/environment.yml b/modules/nf-core/samtools/bgzip/environment.yml
new file mode 100644
index 00000000000..39e8b277518
--- /dev/null
+++ b/modules/nf-core/samtools/bgzip/environment.yml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+
+dependencies:
+  - bioconda::samtools=1.21
+  - conda-forge::file=5.46
diff --git a/modules/nf-core/samtools/bgzip/main.nf b/modules/nf-core/samtools/bgzip/main.nf
new file mode 100644
index 00000000000..d66db959c63
--- /dev/null
+++ b/modules/nf-core/samtools/bgzip/main.nf
@@ -0,0 +1,75 @@
+// Emit "<prefix>.bgzip.fa.gz" for a FASTA input. The input is inspected with
+// `file`: gzip data with an "extra field" is assumed to be BGZF and symlinked,
+// other gzip data is recompressed with bgzip, anything else is compressed.
+process SAMTOOLS_BGZIP {
+    tag "$fasta"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/61/61be440cd54169fcefe5303238847dd466728453a130f4fbc5abd68b514f0b09/data' :
+        'community.wave.seqera.io/library/samtools_file:f35ca613dd912bed' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path ("*.bgzip.fa.gz") , emit: fa
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    COMPRESS_TYPE=\$(file -L -b $fasta)
+
+    case "\$COMPRESS_TYPE" in
+        "gzip compressed data, extra field"*)
+            # A well-behaved file installation should report:
+            #     Blocked GNU Zip Format (BGZF; gzip compatible)
+            # But the one in anaconda does not.
+            # Assuming the "extra field" implies BGZF, do nothing.
+            ln -s $fasta ${prefix}.bgzip.fa.gz
+            ;;
+        gzip*)
+            # Recompress non-BGZF gzipped files
+            zcat $fasta |
+                bgzip \\
+                    $args \\
+                    --threads ${task.cpus} \\
+                    > ${prefix}.bgzip.fa.gz
+            ;;
+        *)
+            # Compress
+            bgzip \\
+                $args \\
+                --threads ${task.cpus} \\
+                --stdout \\
+                $fasta \\
+                > ${prefix}.bgzip.fa.gz
+            ;;
+    esac
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//'))
+        samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//'))
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo '' | bgzip > ${prefix}.bgzip.fa.gz
+
+    cat <<-END_VERSIONS > versions.yml
+
+    "${task.process}":
+        file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//'))
+        samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//'))
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/bgzip/meta.yml b/modules/nf-core/samtools/bgzip/meta.yml
new file mode 100644
index 00000000000..51377c49e81
--- /dev/null
+++ b/modules/nf-core/samtools/bgzip/meta.yml
@@ -0,0 +1,59 @@
+name: samtools_bgzip
+description: Produce a FASTA file compressed with the BGZF algorithm
+keywords:
+  - fasta
+  - BGZF
+  - bgzip
+tools:
+  - file:
+      description: |
+        The file command is "a file type guesser", that is, a command-line tool that tells you
+        in words what kind of data a file contains. Unlike most GUI systems, command-line UNIX
+        systems - with this program leading the charge - don't rely on filename extensions to
+        tell you the type of a file, but look at the file's actual contents. This is, of course,
+        more reliable, but requires a bit of I/O.
+      homepage: https://www.darwinsys.com/file/
+      documentation: https://manpages.debian.org/bookworm/file/file.1.en.html
+      licence: ["BSD-2-clause"]
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+      identifier: biotools:samtools
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing reference information
+          e.g. [ id:'test' ]
+    - fasta:
+        type: file
+        description: FASTA file, compressed or not.
+        pattern: "*.{fa,fa.gz,fasta,fasta.gz}"
+output:
+  - fa:
+      - meta:
+          type: map
+          description: |
+            Groovy Map containing reference information
+            e.g. [ id:'test' ]
+      - "*.bgzip.fa.gz":
+          type: file
+          description: |
+            A FASTA file compressed with the BGZF algorithm. It will be
+            the original file if it was already BGZF-compressed.
+          pattern: "*.bgzip.fa.gz"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+authors:
+  - "@charles-plessy"
+maintainers:
+  - "@charles-plessy"
diff --git a/modules/nf-core/samtools/bgzip/tests/main.nf.test b/modules/nf-core/samtools/bgzip/tests/main.nf.test
new file mode 100644
index 00000000000..2d8379cd14e
--- /dev/null
+++ b/modules/nf-core/samtools/bgzip/tests/main.nf.test
@@ -0,0 +1,89 @@
+nextflow_process {
+
+    name "Test Process SAMTOOLS_BGZIP"
+    script "../main.nf"
+    process "SAMTOOLS_BGZIP"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "samtools"
+    tag "samtools/bgzip"
+
+    test("test_samtools_bgzip - fasta") {
+
+        when {
+            process {
+                """
+                input[0] = [ [ id:'test', single_end:false ], // meta map
+                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("test_samtools_bgzip - fasta bgzipped") {
+
+        when {
+            process {
+                """
+                input[0] = [ [ id:'test', single_end:false ], // meta map
+                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("test_samtools_bgzip - proteome gzipped") {
+        // This file is gzipped but not bgzipped. It exercises the recompression branch of the case statement in the module.
+
+        when {
+            process {
+                """
+                input[0] = [ [ id:'test', single_end:false ], // meta map
+                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true) ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("test_samtools_bgzip - fasta stub") {
+
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = [ [ id:'test', single_end:false ], // meta map
+                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/samtools/bgzip/tests/main.nf.test.snap b/modules/nf-core/samtools/bgzip/tests/main.nf.test.snap
new file mode 100644
index 00000000000..14f8498b2ac
--- /dev/null
+++ b/modules/nf-core/samtools/bgzip/tests/main.nf.test.snap
@@ -0,0 +1,142 @@
+{
+    "test_samtools_bgzip - fasta": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
+                ],
+                "fa": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.4"
+        },
+        "timestamp": "2025-02-05T16:22:43.985034"
+    },
+    "test_samtools_bgzip - fasta bgzipped": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
+                ],
+                "fa": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.4"
+        },
+        "timestamp": "2025-02-05T16:22:59.160449"
+    },
+    "test_samtools_bgzip - fasta stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4"
+                ],
+                "fa": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.4"
+        },
+        "timestamp": "2025-02-05T16:23:28.612494"
+    },
+    "test_samtools_bgzip - proteome gzipped": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
+                ],
+                "fa": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.4"
+        },
+        "timestamp": "2025-02-05T16:23:13.752005"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/samtools/bgzip/tests/tags.yml b/modules/nf-core/samtools/bgzip/tests/tags.yml
new file mode 100644
index 00000000000..13e3ed81bbd
--- /dev/null
+++ b/modules/nf-core/samtools/bgzip/tests/tags.yml
@@ -0,0 +1,2 @@
+samtools/bgzip:
+  - modules/nf-core/samtools/bgzip/**