From 6c9d861fae460dc79b498f9472340c75cf9d081f Mon Sep 17 00:00:00 2001 From: Charles Plessy Date: Wed, 5 Feb 2025 10:56:22 +0900 Subject: [PATCH] New `samtools/bgzip` module to produce BGZF-compressed files. This module can take a FASTA file regardless of whether it is gzipped or not. - If the file was not compressed, it is compressed with bgzip. - If the file was compressed with gzip, it is uncompressed and recompressed with bgzip. - If the file was already compressed with bgzip, the module returns it unchanged. This solves the issue that some pipelines require input data to be already bgzipped, but data providers like NCBI do not use bgzip for compressing their files (in particular genomes). --- .../nf-core/samtools/bgzip/environment.yml | 7 + modules/nf-core/samtools/bgzip/main.nf | 72 +++++++++ modules/nf-core/samtools/bgzip/meta.yml | 59 ++++++++ .../nf-core/samtools/bgzip/tests/main.nf.test | 89 +++++++++++ .../samtools/bgzip/tests/main.nf.test.snap | 142 ++++++++++++++++++ modules/nf-core/samtools/bgzip/tests/tags.yml | 2 + 6 files changed, 371 insertions(+) create mode 100644 modules/nf-core/samtools/bgzip/environment.yml create mode 100644 modules/nf-core/samtools/bgzip/main.nf create mode 100644 modules/nf-core/samtools/bgzip/meta.yml create mode 100644 modules/nf-core/samtools/bgzip/tests/main.nf.test create mode 100644 modules/nf-core/samtools/bgzip/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/bgzip/tests/tags.yml diff --git a/modules/nf-core/samtools/bgzip/environment.yml b/modules/nf-core/samtools/bgzip/environment.yml new file mode 100644 index 00000000000..39e8b277518 --- /dev/null +++ b/modules/nf-core/samtools/bgzip/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::samtools=1.21 + - conda-forge::file=5.46 diff --git a/modules/nf-core/samtools/bgzip/main.nf b/modules/nf-core/samtools/bgzip/main.nf new file mode 100644 index 00000000000..d66db959c63 --- /dev/null +++ 
b/modules/nf-core/samtools/bgzip/main.nf @@ -0,0 +1,72 @@ +process SAMTOOLS_BGZIP { + tag "$fasta" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/61/61be440cd54169fcefe5303238847dd466728453a130f4fbc5abd68b514f0b09/data' : + 'community.wave.seqera.io/library/samtools_file:f35ca613dd912bed' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path ("*.bgzip.fa.gz") , emit: fa + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + COMPRESS_TYPE=\$(file -L -b $fasta) + + case "\$COMPRESS_TYPE" in + "gzip compressed data, extra field"*) + # A well-behaved file installation should report: + # Blocked GNU Zip Format (BGZF; gzip compatible) + # But the one in anaconda does not. + # Assuming the "extra field" implies BGZF, only symlink the input. 
+ ln -s $fasta ${prefix}.bgzip.fa.gz + ;; + gzip*) + # Recompress non-BGZF gzipped files + zcat $fasta | + bgzip \\ + $args \\ + --threads ${task.cpus} \\ + > ${prefix}.bgzip.fa.gz + ;; + *) + # Not gzip-compressed: compress with bgzip + bgzip \\ + $args \\ + --threads ${task.cpus} \\ + --stdout \\ + $fasta \\ + > ${prefix}.bgzip.fa.gz + ;; + esac + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//')) + samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//')) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo '' | bgzip > ${prefix}.bgzip.fa.gz + + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//')) + samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/bgzip/meta.yml b/modules/nf-core/samtools/bgzip/meta.yml new file mode 100644 index 00000000000..51377c49e81 --- /dev/null +++ b/modules/nf-core/samtools/bgzip/meta.yml @@ -0,0 +1,59 @@ +name: samtools_bgzip +description: Produce a FASTA file compressed with the BGZF algorithm +keywords: + - fasta + - BGZF + - bgzip +tools: + - file: + description: | + The file command is "a file type guesser", that is, a command-line tool that tells you + in words what kind of data a file contains. Unlike most GUI systems, command-line UNIX + systems - with this program leading the charge - don't rely on filename extensions to + tell you the type of a file, but look at the file's actual contents. This is, of course, + more reliable, but requires a bit of I/O. 
+ homepage: https://www.darwinsys.com/file/ + documentation: https://manpages.debian.org/bookworm/file/file.1.en.html + licence: ["BSD-2-clause"] + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + identifier: biotools:samtools +input: + - - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: FASTA file, compressed or not. + pattern: "*.{fa,fa.gz,fasta,fasta.gz}" +output: + - fa: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bgzip.fa.gz": + type: file + description: | + A FASTA file compressed with the BGZF algorithm. It will be + the original file if it was already BGZF-compressed. 
+ pattern: "*.bgzip.fa.gz" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@charles-plessy" +maintainers: + - "@charles-plessy" diff --git a/modules/nf-core/samtools/bgzip/tests/main.nf.test b/modules/nf-core/samtools/bgzip/tests/main.nf.test new file mode 100644 index 00000000000..2d8379cd14e --- /dev/null +++ b/modules/nf-core/samtools/bgzip/tests/main.nf.test @@ -0,0 +1,89 @@ +nextflow_process { + + name "Test Process SAMTOOLS_BGZIP" + script "../main.nf" + process "SAMTOOLS_BGZIP" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/bgzip" + + test("test_samtools_bgzip - fasta") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_bgzip - fasta bgzipped") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_bgzip - proteome gzipped") { + // This file is not bgzipped. It is used to check the re-zipping branch of the case statement in the module. 
+ + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_bgzip - fasta stub") { + + options "-stub" + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/samtools/bgzip/tests/main.nf.test.snap b/modules/nf-core/samtools/bgzip/tests/main.nf.test.snap new file mode 100644 index 00000000000..14f8498b2ac --- /dev/null +++ b/modules/nf-core/samtools/bgzip/tests/main.nf.test.snap @@ -0,0 +1,142 @@ +{ + "test_samtools_bgzip - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "1": [ + "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "versions": [ + "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-02-05T16:22:43.985034" + }, + "test_samtools_bgzip - fasta bgzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "1": [ + "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "versions": [ + 
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-02-05T16:22:59.160449" + }, + "test_samtools_bgzip - fasta stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-02-05T16:23:28.612494" + }, + "test_samtools_bgzip - proteome gzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d" + ] + ], + "1": [ + "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d" + ] + ], + "versions": [ + "versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-02-05T16:23:13.752005" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/bgzip/tests/tags.yml b/modules/nf-core/samtools/bgzip/tests/tags.yml new file mode 100644 index 00000000000..13e3ed81bbd --- /dev/null +++ b/modules/nf-core/samtools/bgzip/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/bgzip: + - modules/nf-core/samtools/bgzip/**