Skip to content

Commit

Permalink
New samtools/bgzip module to produce BGZF-compressed files.
Browse files Browse the repository at this point in the history
This module can take a FASTA file regardless of whether it is gzipped or not.

 - If the file was not compressed, it is compressed with bgzip.
 - If the file was compressed with gzip, it is uncompressed and
   recompressed with bgzip.
 - If the file was already compressed with bgzip, the module
   returns it unchanged.

This solves the issue that some pipelines require input data to be
already bgzipped, but data providers like NCBI do not use bgzip for
compressing their files (in particular genomes).
  • Loading branch information
charles-plessy committed Feb 5, 2025
1 parent 094299f commit 6c9d861
Show file tree
Hide file tree
Showing 6 changed files with 371 additions and 0 deletions.
7 changes: 7 additions & 0 deletions modules/nf-core/samtools/bgzip/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment for the samtools/bgzip module.
# Channels are listed in priority order.
channels:
  - conda-forge
  - bioconda

# Each dependency is pinned to an exact version and names its source channel.
dependencies:
  - bioconda::samtools=1.21
  - conda-forge::file=5.46
72 changes: 72 additions & 0 deletions modules/nf-core/samtools/bgzip/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Produce a BGZF-compressed FASTA file from an input that may be plain,
// gzip-compressed, or already BGZF-compressed.
//
// Input : tuple val(meta), path(fasta)
// Output: fa       - tuple val(meta), path("*.bgzip.fa.gz")
//         versions - versions.yml reporting the `file` and `samtools` versions
//
// The compression type is detected from the file contents with `file`, not
// from the file name.
process SAMTOOLS_BGZIP {
    tag "$fasta"
    label 'process_low'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/61/61be440cd54169fcefe5303238847dd466728453a130f4fbc5abd68b514f0b09/data' :
        'community.wave.seqera.io/library/samtools_file:f35ca613dd912bed' }"

    input:
    tuple val(meta), path(fasta)

    output:
    tuple val(meta), path ("*.bgzip.fa.gz") , emit: fa
    path "versions.yml"                     , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args   ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // NOTE: the thread count must come from `task.cpus`; `task.cpu` is not a
    // task property and does not resolve to the allocated CPU count.
    """
    # -L dereferences symlinks, -b omits the file name from the report.
    COMPRESS_TYPE=\$(file -L -b $fasta)
    case "\$COMPRESS_TYPE" in
        "gzip compressed data, extra field"*)
            # A well-behaved file installation should report:
            #     Blocked GNU Zip Format (BGZF; gzip compatible)
            # But the one in anaconda does not.
            # Assuming the "extra field" implies BGZF, do nothing
            # except expose the input under the expected output name.
            ln -s $fasta ${prefix}.bgzip.fa.gz
            ;;
        gzip*)
            # Recompress non-BGZF gzipped files
            zcat $fasta |
                bgzip \\
                    $args \\
                    --threads ${task.cpus} \\
                    > ${prefix}.bgzip.fa.gz
            ;;
        *)
            # Compress uncompressed input
            bgzip \\
                $args \\
                --threads ${task.cpus} \\
                --stdout \\
                $fasta \\
                > ${prefix}.bgzip.fa.gz
            ;;
    esac

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//'))
        samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//'))
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // The stub still writes a valid (nearly empty) BGZF file.
    """
    echo '' | bgzip > ${prefix}.bgzip.fa.gz

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//'))
        samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//'))
    END_VERSIONS
    """
}
59 changes: 59 additions & 0 deletions modules/nf-core/samtools/bgzip/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Module metadata for samtools/bgzip: describes the tools used and the
# input/output channels of the SAMTOOLS_BGZIP process.
name: samtools_bgzip
description: Produce a FASTA file compressed with the BGZF algorithm
keywords:
  - fasta
  - BGZF
  - bgzip
tools:
  - file:
      description: |
        The file command is "a file type guesser", that is, a command-line tool that tells you
        in words what kind of data a file contains. Unlike most GUI systems, command-line UNIX
        systems - with this program leading the charge - don't rely on filename extensions to
        tell you the type of a file, but look at the file's actual contents. This is, of course,
        more reliable, but requires a bit of I/O.
      homepage: https://www.darwinsys.com/file/
      documentation: https://manpages.debian.org/bookworm/file/file.1.en.html
      licence: ["BSD-2-clause"]
  - samtools:
      description: |
        SAMtools is a set of utilities for interacting with and post-processing
        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
        These files are generated as output by short read aligners like BWA.
      homepage: http://www.htslib.org/
      documentation: http://www.htslib.org/doc/samtools.html
      doi: 10.1093/bioinformatics/btp352
      licence: ["MIT"]
      identifier: biotools:samtools
input:
  - - meta:
        type: map
        description: |
          Groovy Map containing reference information
          e.g. [ id:'test' ]
    - fasta:
        type: file
        description: FASTA file, compressed or not.
        pattern: "*.{fa,fa.gz,fasta,fasta.gz}"
output:
  - fa:
      # The meta map is passed through unchanged, so it is described the
      # same way as on the input side.
      - meta:
          type: map
          description: |
            Groovy Map containing reference information
            e.g. [ id:'test' ]
      - "*.bgzip.fa.gz":
          type: file
          description: |
            A FASTA file compressed with the BGZF algorithm. It will be
            the original file if it was already BGZF-compressed.
          # Same glob as the process output declaration.
          pattern: "*.bgzip.fa.gz"
  - versions:
      - versions.yml:
          type: file
          description: File containing software versions
          pattern: "versions.yml"
authors:
  - "@charles-plessy"
maintainers:
  - "@charles-plessy"
89 changes: 89 additions & 0 deletions modules/nf-core/samtools/bgzip/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// nf-test suite for the SAMTOOLS_BGZIP process defined in ../main.nf.
// Every test asserts that the process succeeds and that its outputs match the
// stored snapshot. Test names are the snapshot keys and must not be changed.
nextflow_process {

    name "Test Process SAMTOOLS_BGZIP"
    script "../main.nf"
    process "SAMTOOLS_BGZIP"

    tag "modules"
    tag "modules_nfcore"
    tag "samtools"
    tag "samtools/bgzip"

    // Input: genome.fasta (no .gz suffix).
    test("test_samtools_bgzip - fasta") {

        when {
            process {
                """
                input[0] = [
                    [ id:'test', single_end:false ], // meta map
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
                ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }

    // Input: genome.fasta.gz, which the test name describes as already bgzipped.
    test("test_samtools_bgzip - fasta bgzipped") {

        when {
            process {
                """
                input[0] = [
                    [ id:'test', single_end:false ], // meta map
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true)
                ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }

    // Input: proteome.fasta.gz. This file is not bgzipped; it is used to
    // check the re-zipping branch of the case statement in the module.
    test("test_samtools_bgzip - proteome gzipped") {

        when {
            process {
                """
                input[0] = [
                    [ id:'test', single_end:false ], // meta map
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true)
                ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }

    // Same input as the first test, but run with the -stub option.
    test("test_samtools_bgzip - fasta stub") {

        options "-stub"

        when {
            process {
                """
                input[0] = [
                    [ id:'test', single_end:false ], // meta map
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
                ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }
}
142 changes: 142 additions & 0 deletions modules/nf-core/samtools/bgzip/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
{
"test_samtools_bgzip - fasta": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"1": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"versions": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:22:43.985034"
},
"test_samtools_bgzip - fasta bgzipped": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"1": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"versions": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:22:59.160449"
},
"test_samtools_bgzip - fasta stub": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
]
],
"1": [
"versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
]
],
"versions": [
"versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:23:28.612494"
},
"test_samtools_bgzip - proteome gzipped": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d"
]
],
"1": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d"
]
],
"versions": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:23:13.752005"
}
}
2 changes: 2 additions & 0 deletions modules/nf-core/samtools/bgzip/tests/tags.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Associates the samtools/bgzip tag with every path under the module directory.
samtools/bgzip:
  - modules/nf-core/samtools/bgzip/**

0 comments on commit 6c9d861

Please sign in to comment.