Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New samtools/bgzip module to produce BGZF-compressed files. #7433

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions modules/nf-core/samtools/bgzip/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Conda environment for the samtools/bgzip module.
# Channels are listed in the order they appear in the module's original file.
channels:
  - conda-forge
  - bioconda

# Pinned tool versions:
#   samtools - the module calls `bgzip` and reports `samtools --version`
#   file     - used to inspect the compression type of the input
dependencies:
  - bioconda::samtools=1.21
  - conda-forge::file=5.46
72 changes: 72 additions & 0 deletions modules/nf-core/samtools/bgzip/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// SAMTOOLS_BGZIP: emit a BGZF-compressed copy of a FASTA file.
//
// Input:
//   tuple val(meta), path(fasta)  - meta map and a FASTA file (plain, gzipped or BGZF)
// Output:
//   fa        - tuple of meta and "<prefix>.bgzip.fa.gz"
//   versions  - versions.yml listing the `file` and `samtools` versions
//
// The script asks `file` what the input is and then takes one of three paths:
//   1. gzip with an "extra field"  -> assumed to be BGZF already, symlinked as-is
//   2. any other gzip              -> decompressed and recompressed with bgzip
//   3. anything else               -> compressed with bgzip
process SAMTOOLS_BGZIP {
    tag "$fasta"
    label 'process_low'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/61/61be440cd54169fcefe5303238847dd466728453a130f4fbc5abd68b514f0b09/data' :
        'community.wave.seqera.io/library/samtools_file:f35ca613dd912bed' }"

    input:
    tuple val(meta), path(fasta)

    output:
    tuple val(meta), path ("*.bgzip.fa.gz") , emit: fa
    path "versions.yml"                     , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Writing to (or symlinking over) a file with the same name as the input would
    // either fail or clobber the staged input, so refuse that configuration up front.
    if ("$fasta" == "${prefix}.bgzip.fa.gz") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
    """
    COMPRESS_TYPE=\$(file -L -b $fasta)

    case "\$COMPRESS_TYPE" in
        "gzip compressed data, extra field"*)
            # A well-behaved file installation should report:
            # Blocked GNU Zip Format (BGZF; gzip compatible)
            # But the one in anaconda does not.
            # Assuming the "extra field" implies BGZF, do nothing.
            ln -s $fasta ${prefix}.bgzip.fa.gz
            ;;
        gzip*)
            # Recompress non-BGZF gzipped files
            zcat $fasta |
                bgzip \\
                    $args \\
                    --threads ${task.cpus} \\
                    > ${prefix}.bgzip.fa.gz
            ;;
        *)
            # Compress
            bgzip \\
                $args \\
                --threads ${task.cpus} \\
                --stdout \\
                $fasta \\
                > ${prefix}.bgzip.fa.gz
            ;;
    esac

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//'))
        samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//'))
    END_VERSIONS
    """

    stub:
    def prefix = task.ext.prefix ?: "${meta.id}"
    // NOTE(review): the blank line right after the heredoc opener below is kept from the
    // original; the stub snapshot records a different versions.yml checksum than the
    // real runs, presumably because of it. Removing it requires regenerating the snapshot.
    """
    echo '' | bgzip > ${prefix}.bgzip.fa.gz

    cat <<-END_VERSIONS > versions.yml

    "${task.process}":
        file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//'))
        samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//'))
    END_VERSIONS
    """
}
59 changes: 59 additions & 0 deletions modules/nf-core/samtools/bgzip/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Module metadata for samtools/bgzip: describes the tools used and the
# input/output channels declared by the SAMTOOLS_BGZIP process.
name: samtools_bgzip
description: Produce a FASTA file compressed with the BGZF algorithm
keywords:
  - fasta
  - BGZF
  - bgzip
tools:
  - file:
      description: |
        The file command is "a file type guesser", that is, a command-line tool that tells you
        in words what kind of data a file contains. Unlike most GUI systems, command-line UNIX
        systems - with this program leading the charge - don't rely on filename extensions to
        tell you the type of a file, but look at the file's actual contents. This is, of course,
        more reliable, but requires a bit of I/O.
      homepage: https://www.darwinsys.com/file/
      documentation: https://manpages.debian.org/bookworm/file/file.1.en.html
      licence: ["BSD-2-clause"]
  - samtools:
      description: |
        SAMtools is a set of utilities for interacting with and post-processing
        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
        These files are generated as output by short read aligners like BWA.
      homepage: http://www.htslib.org/
      documentation: http://www.htslib.org/doc/samtools.html
      doi: 10.1093/bioinformatics/btp352
      licence: ["MIT"]
      identifier: biotools:samtools
input:
  - - meta:
        type: map
        description: |
          Groovy Map containing reference information
          e.g. [ id:'test' ]
    - fasta:
        type: file
        description: FASTA file, compressed or not.
        pattern: "*.{fa,fa.gz,fasta,fasta.gz}"
output:
  - fa:
      # Same meta map that was passed in with the input FASTA.
      - meta:
          type: map
          description: |
            Groovy Map containing reference information
            e.g. [ id:'test' ]
      - "*.bgzip.fa.gz":
          type: file
          description: |
            A FASTA file compressed with the BGZF algorithm. It will be
            the original file if it was already BGZF-compressed.
          pattern: "*.bgzip.fa.gz"
  - versions:
      - versions.yml:
          type: file
          description: File containing software versions
          pattern: "versions.yml"
authors:
  - "@charles-plessy"
maintainers:
  - "@charles-plessy"
89 changes: 89 additions & 0 deletions modules/nf-core/samtools/bgzip/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// nf-test suite for the SAMTOOLS_BGZIP process.
// Each test feeds one sarscov2 test-data file to the process and compares
// every output channel against the stored snapshot.
nextflow_process {

    name "Test Process SAMTOOLS_BGZIP"
    script "../main.nf"
    process "SAMTOOLS_BGZIP"

    tag "modules"
    tag "modules_nfcore"
    tag "samtools"
    tag "samtools/bgzip"

    // Uncompressed FASTA input.
    test("test_samtools_bgzip - fasta") {

        when {
            process {
                """
                input[0] = [ [ id:'test', single_end:false ], // meta map
                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }

    // Compressed genome FASTA input (named "bgzipped" by the test title).
    test("test_samtools_bgzip - fasta bgzipped") {

        when {
            process {
                """
                input[0] = [ [ id:'test', single_end:false ], // meta map
                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }

    // This file is gzipped but not bgzipped. It exercises the
    // recompression branch of the case statement in the module.
    test("test_samtools_bgzip - proteome gzipped") {

        when {
            process {
                """
                input[0] = [ [ id:'test', single_end:false ], // meta map
                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true) ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }

    // Stub run: only the stub block of the process is executed.
    test("test_samtools_bgzip - fasta stub") {

        options "-stub"
        when {
            process {
                """
                input[0] = [ [ id:'test', single_end:false ], // meta map
                file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }
}
142 changes: 142 additions & 0 deletions modules/nf-core/samtools/bgzip/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
{
"test_samtools_bgzip - fasta": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"1": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"versions": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:22:43.985034"
},
"test_samtools_bgzip - fasta bgzipped": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"1": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6"
]
],
"versions": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:22:59.160449"
},
"test_samtools_bgzip - fasta stub": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
]
],
"1": [
"versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
]
],
"versions": [
"versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:23:28.612494"
},
"test_samtools_bgzip - proteome gzipped": {
"content": [
{
"0": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d"
]
],
"1": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
],
"fa": [
[
{
"id": "test",
"single_end": false
},
"test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d"
]
],
"versions": [
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a"
]
}
],
"meta": {
"nf-test": "0.9.2",
"nextflow": "24.10.4"
},
"timestamp": "2025-02-05T16:23:13.752005"
}
}
2 changes: 2 additions & 0 deletions modules/nf-core/samtools/bgzip/tests/tags.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Maps the samtools/bgzip tag to the paths belonging to this module.
samtools/bgzip:
  - modules/nf-core/samtools/bgzip/**
Loading