-
Notifications
You must be signed in to change notification settings - Fork 752
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New
samtools/bgzip
module to produce BGZF-compressed files.
This module can take a FASTA file regardless of whether it is gzipped or not. - If the file was not compressed, it is compressed with bgzip. - If the file was compressed with gzip, it is uncompressed and recompressed with bgzip. - If the file was already compressed with bgzip, the module returns it unchanged. This solves the issue that some pipelines require input data to be already bgzipped, but data providers like NCBI do not use bgzip for compressing their files (in particular genomes).
- Loading branch information
1 parent
094299f
commit 6c9d861
Showing
6 changed files
with
371 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
channels: | ||
- conda-forge | ||
- bioconda | ||
|
||
dependencies: | ||
- bioconda::samtools=1.21 | ||
- conda-forge::file=5.46 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
// SAMTOOLS_BGZIP: emit a FASTA file named <prefix>.bgzip.fa.gz from a FASTA input. | ||
// The input is classified with `file` and then handled in one of three ways: | ||
//   - gzip data reported with an "extra field": assumed to be BGZF already, symlinked unchanged; | ||
//   - any other gzip data: decompressed with zcat and recompressed with bgzip; | ||
//   - anything else: compressed with bgzip. | ||
// Also emits versions.yml with the versions of `file` and `samtools`. | ||
process SAMTOOLS_BGZIP { | ||
tag "$fasta" | ||
label 'process_low' | ||
|
||
conda "${moduleDir}/environment.yml" | ||
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? | ||
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/61/61be440cd54169fcefe5303238847dd466728453a130f4fbc5abd68b514f0b09/data' : | ||
'community.wave.seqera.io/library/samtools_file:f35ca613dd912bed' }" | ||
|
||
input: | ||
tuple val(meta), path(fasta) | ||
|
||
output: | ||
tuple val(meta), path ("*.bgzip.fa.gz") , emit: fa | ||
path "versions.yml" , emit: versions | ||
|
||
when: | ||
task.ext.when == null || task.ext.when | ||
|
||
script: | ||
def args = task.ext.args ?: '' | ||
def prefix = task.ext.prefix ?: "${meta.id}" | ||
""" | ||
# Classify the input by content (following symlinks), not by file name. | ||
COMPRESS_TYPE=\$(file -L -b $fasta) | ||
case "\$COMPRESS_TYPE" in | ||
"gzip compressed data, extra field"*) | ||
# A well-behaved file installation should report: | ||
# Blocked GNU Zip Format (BGZF; gzip compatible) | ||
# But the one in anaconda does not. | ||
# Assuming the "extra field" implies BGZF, do nothing. | ||
ln -s $fasta ${prefix}.bgzip.fa.gz | ||
;; | ||
gzip*) | ||
# Recompress non-BGZF gzipped files | ||
zcat $fasta | | ||
bgzip \\ | ||
$args \\ | ||
--threads ${task.cpus} \\ | ||
> ${prefix}.bgzip.fa.gz | ||
;; | ||
*) | ||
# Compress | ||
bgzip \\ | ||
$args \\ | ||
--threads ${task.cpus} \\ | ||
--stdout \\ | ||
$fasta \\ | ||
> ${prefix}.bgzip.fa.gz | ||
;; | ||
esac | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//')) | ||
samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//')) | ||
END_VERSIONS | ||
""" | ||
|
||
stub: | ||
def prefix = task.ext.prefix ?: "${meta.id}" | ||
""" | ||
# Stub output: a BGZF file holding a single newline. | ||
echo '' | bgzip > ${prefix}.bgzip.fa.gz | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
file: \$(echo \$(file --version 2>&1 | head -n1 | sed 's/file-//')) | ||
samtools: \$(echo \$(samtools --version 2>&1 | head -n1 | sed 's/^.*samtools //; s/Using.*\$//')) | ||
END_VERSIONS | ||
""" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
name: samtools_bgzip | ||
description: Produce a FASTA file compressed with the BGZF algorithm | ||
keywords: | ||
- fasta | ||
- BGZF | ||
- bgzip | ||
tools: | ||
- file: | ||
description: | | ||
The file command is "a file type guesser", that is, a command-line tool that tells you | ||
in words what kind of data a file contains. Unlike most GUI systems, command-line UNIX | ||
systems - with this program leading the charge - don't rely on filename extensions to | ||
tell you the type of a file, but look at the file's actual contents. This is, of course, | ||
more reliable, but requires a bit of I/O. | ||
homepage: https://www.darwinsys.com/file/ | ||
documentation: https://manpages.debian.org/bookworm/file/file.1.en.html | ||
licence: ["BSD-2-clause"] | ||
- samtools: | ||
description: | | ||
SAMtools is a set of utilities for interacting with and post-processing | ||
short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. | ||
These files are generated as output by short read aligners like BWA. | ||
homepage: http://www.htslib.org/ | ||
documentation: http://www.htslib.org/doc/samtools.html | ||
doi: 10.1093/bioinformatics/btp352 | ||
licence: ["MIT"] | ||
identifier: biotools:samtools | ||
input: | ||
- - meta: | ||
type: map | ||
description: | | ||
Groovy Map containing reference information | ||
e.g. [ id:'test' ] | ||
- fasta: | ||
type: file | ||
description: FASTA file, compressed or not. | ||
pattern: "*.{fa,fa.gz,fasta,fasta.gz}" | ||
output: | ||
- fa: | ||
- meta: | ||
type: map | ||
description: | | ||
Groovy Map containing sample information | ||
e.g. [ id:'test', single_end:false ] | ||
- "*.bgzip.fa.gz": | ||
type: file | ||
description: | | ||
A FASTA file compressed with the BGZF algorithm. It will be | ||
the original file if it was already BGZF-compressed. | ||
pattern: "*.{bgzip.fa.gz}" | ||
- versions: | ||
- versions.yml: | ||
type: file | ||
description: File containing software versions | ||
pattern: "versions.yml" | ||
authors: | ||
- "@charles-plessy" | ||
maintainers: | ||
- "@charles-plessy" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
nextflow_process { | ||
|
||
name "Test Process SAMTOOLS_BGZIP" | ||
script "../main.nf" | ||
process "SAMTOOLS_BGZIP" | ||
|
||
tag "modules" | ||
tag "modules_nfcore" | ||
tag "samtools" | ||
tag "samtools/bgzip" | ||
|
||
test("test_samtools_bgzip - fasta") { | ||
|
||
when { | ||
process { | ||
""" | ||
input[0] = [ [ id:'test', single_end:false ], // meta map | ||
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] | ||
""" | ||
} | ||
} | ||
|
||
then { | ||
assertAll( | ||
{ assert process.success }, | ||
{ assert snapshot(process.out).match() } | ||
) | ||
} | ||
} | ||
|
||
test("test_samtools_bgzip - fasta bgzipped") { | ||
|
||
when { | ||
process { | ||
""" | ||
input[0] = [ [ id:'test', single_end:false ], // meta map | ||
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) ] | ||
""" | ||
} | ||
} | ||
|
||
then { | ||
assertAll( | ||
{ assert process.success }, | ||
{ assert snapshot(process.out).match() } | ||
) | ||
} | ||
} | ||
|
||
test("test_samtools_bgzip - proteome gzipped") { | ||
// This file is not bgzipped. It is used to check the re-zipping branch of the case statement in the module. | ||
|
||
when { | ||
process { | ||
""" | ||
input[0] = [ [ id:'test', single_end:false ], // meta map | ||
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true) ] | ||
""" | ||
} | ||
} | ||
|
||
then { | ||
assertAll( | ||
{ assert process.success }, | ||
{ assert snapshot(process.out).match() } | ||
) | ||
} | ||
} | ||
|
||
test("test_samtools_bgzip - fasta stub") { | ||
|
||
options "-stub" | ||
when { | ||
process { | ||
""" | ||
input[0] = [ [ id:'test', single_end:false ], // meta map | ||
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] | ||
""" | ||
} | ||
} | ||
|
||
then { | ||
assertAll( | ||
{ assert process.success }, | ||
{ assert snapshot(process.out).match() } | ||
) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
{ | ||
"test_samtools_bgzip - fasta": { | ||
"content": [ | ||
{ | ||
"0": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" | ||
] | ||
], | ||
"1": [ | ||
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" | ||
], | ||
"fa": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" | ||
] | ||
], | ||
"versions": [ | ||
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" | ||
] | ||
} | ||
], | ||
"meta": { | ||
"nf-test": "0.9.2", | ||
"nextflow": "24.10.4" | ||
}, | ||
"timestamp": "2025-02-05T16:22:43.985034" | ||
}, | ||
"test_samtools_bgzip - fasta bgzipped": { | ||
"content": [ | ||
{ | ||
"0": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" | ||
] | ||
], | ||
"1": [ | ||
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" | ||
], | ||
"fa": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,6e9fe4042a72f2345f644f239272b7e6" | ||
] | ||
], | ||
"versions": [ | ||
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" | ||
] | ||
} | ||
], | ||
"meta": { | ||
"nf-test": "0.9.2", | ||
"nextflow": "24.10.4" | ||
}, | ||
"timestamp": "2025-02-05T16:22:59.160449" | ||
}, | ||
"test_samtools_bgzip - fasta stub": { | ||
"content": [ | ||
{ | ||
"0": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" | ||
] | ||
], | ||
"1": [ | ||
"versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4" | ||
], | ||
"fa": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" | ||
] | ||
], | ||
"versions": [ | ||
"versions.yml:md5,18b35ac93df8e34bed8e6c086d6394c4" | ||
] | ||
} | ||
], | ||
"meta": { | ||
"nf-test": "0.9.2", | ||
"nextflow": "24.10.4" | ||
}, | ||
"timestamp": "2025-02-05T16:23:28.612494" | ||
}, | ||
"test_samtools_bgzip - proteome gzipped": { | ||
"content": [ | ||
{ | ||
"0": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d" | ||
] | ||
], | ||
"1": [ | ||
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" | ||
], | ||
"fa": [ | ||
[ | ||
{ | ||
"id": "test", | ||
"single_end": false | ||
}, | ||
"test.bgzip.fa.gz:md5,db0ecd5dbce6bf9730685b94ec87854d" | ||
] | ||
], | ||
"versions": [ | ||
"versions.yml:md5,dcbb9ab7afbb0348506d508dac612f9a" | ||
] | ||
} | ||
], | ||
"meta": { | ||
"nf-test": "0.9.2", | ||
"nextflow": "24.10.4" | ||
}, | ||
"timestamp": "2025-02-05T16:23:13.752005" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
samtools/bgzip: | ||
- modules/nf-core/samtools/bgzip/** |