Skip to content

Commit

Permalink
diamond various fixes (#4210)
Browse files Browse the repository at this point in the history
* diamond fix test parameter name

comp-based-stat needs to be comp_based_stats

also remove param name attribute where possible

* add options missing from command line

- salltitles and sallseqid were in the inputs but unused in the CLI
- fix --unal (reports unaligned queries in fmt 6)
- add --al and modify --un for reporting (un)aligned query sequences
  • Loading branch information
bernt-matthias authored Nov 27, 2021
1 parent 6ecdbaa commit 828c844
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 35 deletions.
84 changes: 60 additions & 24 deletions tools/diamond/diamond.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="bg_diamond" name="Diamond" version="@[email protected]" profile="19.01">
<tool id="bg_diamond" name="Diamond" version="@[email protected]" profile="19.01">
<description>alignment tool for short sequences against a protein database</description>
<macros>
<import>macros.xml</import>
Expand Down Expand Up @@ -62,8 +62,23 @@
--query-cover '$query_cover'
--subject-cover '$subject_cover'
--block-size '$sens_cond.block_size'
#if str($unal) == '1':
--unal 1 --un '$unalqueries'
#if $output_unal
    ## Both optional files use the same sequence format, derived once from the
    ## query datatype: extensions starting with "fasta" give fasta, anything else fastq.
    #set $query_seq_fmt = "fasta" if $query.ext.startswith("fasta") else "fastq"
    #if "--un" in $output_unal
        --un '$unalqueries'
        --unfmt $query_seq_fmt
    #end if
    #if "--al" in $output_unal
        --al '$alqueries'
        --alfmt $query_seq_fmt
    #end if
#end if
#if $tax_cond.tax_select == 'file':
--taxonlist `cat '$tax_cond.taxonlistfile' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//'`
Expand All @@ -79,7 +94,7 @@
<option value="blastx">Align DNA query sequences (blastx)</option>
</param>
<when value="blastx">
<param name="query_gencode" argument="--query-gencode" type="select" label="Genetic code used for translation of query in BLASTX mode" help="">
<param argument="--query-gencode" type="select" label="Genetic code used for translation of query in BLASTX mode" help="">
<option value="1">The Standard Code</option>
<option value="2">The Vertebrate Mitochondrial Code</option>
<option value="3">The Yeast Mitochondrial Code</option>
Expand All @@ -100,7 +115,7 @@
<option value="25">Candidate Division SR1 and Gracilibacteria Code</option>
<option value="26">Pachysolen tannophilus Nuclear Code</option>
</param>
<param argument="--min-orf" name="min_orf" type="integer" value="1" label="ignore translated sequences without an open reading frame of at least this length" help="By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature" />
<param argument="--min-orf" type="integer" value="1" label="ignore translated sequences without an open reading frame of at least this length" help="By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature" />

<param name="query_strand" argument="--strand" type="select" label="query strands to search" help="">
<option value="both" selected="True">Both</option>
Expand All @@ -113,21 +128,21 @@
<option value="no" selected="true">no</option>
</param>
<when value="yes">
<param argument="--range-culling" name="range_culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. Note that this feature is currently only available in frameshift alignment mode"/>
<param argument="--range-culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. Note that this feature is currently only available in frameshift alignment mode"/>
<param argument="--frameshift" type="integer" value="0" label="frame shift penalty" help="Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads. In the pairwise output format, frameshifts will be indicated by \ and / for a shift by +1 and -1 nucleotide in the direction of translation respectively." />
</when>
<when value="no"/>
</conditional>

<param name="comp_based_stats" argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
<param argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
<option value="0">Disable</option>
<option value="1" selected="True">Default mode (Hauser, 2016)</option>
</param>
</when>
<when value="blastp">
<param name="no_self_hits" argument="--no-self-hits" type="boolean" truevalue="--no-self-hits" falsevalue="" checked="true" label="suppress reporting of identical self hits?" help=""/>
<param argument="--no-self-hits" type="boolean" truevalue="--no-self-hits" falsevalue="" checked="true" label="suppress reporting of identical self hits?" help=""/>

<param name="comp_based_stats" argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
<param argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
<option value="0">Disable</option>
<option value="1" selected="True">Default mode (Hauser, 2016)</option>
<option value="2">Compositional matrix adjust conditioned on sequence properties, simplified (Yu, 2005)</option>
Expand Down Expand Up @@ -234,18 +249,24 @@
</conditional>
<expand macro="hit_filter_macro" />
<param argument="--id" type="integer" value="0" label="Minimum identity percentage to report an alignment" help="" />
<param name="query_cover" argument="--query-cover" type="integer" value="0" label="Minimum query cover percentage to report an alignment" help="" />
<param name="subject_cover" argument="--subject-cover" type="integer" value="0" label="Minimum subject cover percentage to report an alignment" help="" />
<param argument="--unal" type="boolean" truevalue="1" falsevalue="0" checked="false" label="report unaligned queries" help=""/>
<param argument="--query-cover" type="integer" value="0" label="Minimum query cover percentage to report an alignment" help="" />
<param argument="--subject-cover" type="integer" value="0" label="Minimum subject cover percentage to report an alignment" help="" />
<!-- Optional multi-select choosing which extra query-sequence files are written.
     Each option value is the literal diamond flag that the command section and the
     output filters look for inside output_unal. -->
<param name="output_unal" type="select" optional="true" multiple="true" label="Output aligned/unaligned queries to separate file" help="">
<option value="--un">Output unaligned queries (--un)</option>
<option value="--al">Output aligned queries (--al)</option>
</param>
</inputs>
<outputs>
<expand macro="output_macro" />
<data format="fasta" name="unalqueries" label="${tool.name} on ${on_string} (unaligned queries)">
<filter>unal == "1"</filter>
<!-- Unaligned query sequences. The dataset is only created when the "un" choice of
     output_unal is selected; format_source gives it the datatype of the query input. -->
<data format_source="query" name="unalqueries" label="${tool.name} on ${on_string}: unaligned queries">
<filter>output_unal and "--un" in output_unal</filter>
</data>
<!-- Aligned query sequences. The dataset is only created when the "al" choice of
     output_unal is selected, matching the command section, which writes to alqueries
     only in that case; format_source gives it the datatype of the query input. -->
<data format_source="query" name="alqueries" label="${tool.name} on ${on_string}: aligned queries">
<filter>output_unal and "--al" in output_unal</filter>
</data>
</outputs>
<tests>
<test>
<test expect_num_outputs="3">
<conditional name="method_cond">
<param name="method_select" value="blastp" />
</conditional>
Expand All @@ -256,13 +277,15 @@
</conditional>
<conditional name="output">
<param name="outfmt" value="6"/>
<param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,cigar,scovhsp,sskingdoms,skingdoms,sphylums"/>
<!-- removed ,cigar from test: https://github.com/bbuchfink/diamond/issues/532 -->
<param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,sskingdoms,skingdoms,sphylums"/>
<param name="unal" value="true"/>
</conditional>
<conditional name="sens_cond">
<param name="sensitivity" value=""/>
</conditional>
<param name="matrix" value="BLOSUM62"/>
<param name="comp-based-stat" value="1"/>
<param name="comp_based_stats" value="1"/>
<param name="masking" value="1"/>
<conditional name="hit_filter">
<param name="hit_filter_select" value="max"/>
Expand All @@ -277,9 +300,20 @@
<conditional name="sens_cond">
<param name="block_size" value="2"/>
</conditional>
<param name="output_unal" value="--al,--un"/>
<output name="unalqueries">
<assert_contents>
<has_line line=">shuffled sequence that should go to unaligned"/>
</assert_contents>
</output>
<output name="alqueries">
<assert_contents>
<has_line line=">sequence more text"/>
</assert_contents>
</output>
<output name="blast_tabular" file="diamond_results.tabular"/>
</test>
<test>
<test expect_num_outputs="1">
<conditional name="method_cond">
<param name="method_select" value="blastp" />
</conditional>
Expand All @@ -300,7 +334,7 @@
<param name="sensitivity" value=""/>
</conditional>
<param name="matrix" value="BLOSUM62"/>
<param name="comp-based-stat" value="1"/>
<param name="comp_based_stats" value="1"/>
<param name="masking" value="1"/>
<conditional name="hit_filter">
<param name="hit_filter_select" value="max"/>
Expand All @@ -317,7 +351,7 @@
</conditional>
<output name="blast_tabular" file="diamond_results.wtax.tabular"/>
</test>
<test>
<test expect_num_outputs="1">
<conditional name="method_cond">
<param name="method_select" value="blastx" />
<conditional name="frameshift_cond">
Expand All @@ -336,7 +370,7 @@
<param name="sensitivity" value=""/>
</conditional>
<param name="matrix" value="BLOSUM62"/>
<param name="comp-based-stat" value="1"/>
<param name="comp_based_stats" value="1"/>
<param name="masking" value="1"/>
<conditional name="hit_filter">
<param name="hit_filter_select" value="top"/>
Expand All @@ -353,7 +387,7 @@
</conditional>
<output name="blast_tabular" file="diamond_results.pairwise"/>
</test>
<test>
<test expect_num_outputs="1">
<conditional name="method_cond">
<param name="method_select" value="blastp" />
</conditional>
Expand All @@ -364,10 +398,12 @@
</conditional>
<conditional name="output">
<param name="outfmt" value="100"/>
<param name="salltitles" value="false"/>
<param name="sallseqid" value="false"/>
</conditional>
<output name="daa_output" file="diamond_results.daa" compare="sim_size" delta="10"/>
</test>
<test>
<test expect_num_outputs="1">
<conditional name="method_cond">
<param name="method_select" value="blastx" />
<conditional name="frameshift_cond">
Expand All @@ -386,7 +422,7 @@
<param name="sensitivity" value=""/>
</conditional>
<param name="matrix" value="BLOSUM62"/>
<param name="comp-based-stat" value="1"/>
<param name="comp_based_stats" value="1"/>
<param name="masking" value="1"/>
<conditional name="hit_filter">
<param name="hit_filter_select" value="top"/>
Expand Down
2 changes: 1 addition & 1 deletion tools/diamond/diamond_makedb.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="bg_diamond_makedb" name="Diamond makedb" version="@VERSION@" profile="19.01">
<tool id="bg_diamond_makedb" name="Diamond makedb" version="@TOOL_VERSION@" profile="19.01">
<description>Build database from a FASTA file</description>
<macros>
<import>macros.xml</import>
Expand Down
8 changes: 4 additions & 4 deletions tools/diamond/diamond_view.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="bg_diamond_view" name="Diamond view" version="@VERSION@" profile="19.01">
<tool id="bg_diamond_view" name="Diamond view" version="@[email protected]" profile="19.01">
<description>generate formatted output from DAA files</description>
<macros>
<import>macros.xml</import>
Expand Down Expand Up @@ -29,7 +29,7 @@
<expand macro="output_macro" />
</outputs>
<tests>
<test>
<test expect_num_outputs="1">
<param name="daa" ftype="daa" value="diamond_results.daa" />
<conditional name="output">
<param name="outfmt" value="5"/>
Expand All @@ -40,15 +40,15 @@
</conditional>
<output name="blast_tabular" file="diamond_results.xml"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="daa" ftype="daa" value="diamond_results.daa" />
<conditional name="output">
<param name="outfmt" value="6"/>
<param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,cigar,scovhsp"/>
</conditional>
<output name="blast_tabular" file="diamond_view_results.tabular"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="daa" ftype="daa" value="diamond_results.daa" />
<conditional name="output">
<param name="outfmt" value="101"/>
Expand Down
14 changes: 10 additions & 4 deletions tools/diamond/macros.xml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
<macros>
<token name="@VERSION@">2.0.8</token>
<token name="@TOOL_VERSION@">2.0.8</token>

<xml name="requirements">
<requirements>
<requirement type="package" version="@VERSION@">diamond</requirement>
<requirement type="package" version="@TOOL_VERSION@">diamond</requirement>
</requirements>
</xml>

Expand Down Expand Up @@ -69,6 +69,7 @@
<option value="cigar">Cigar</option>
<yield/>
</param>
<param argument="--unal" type="boolean" label="Report unaligned queries" truevalue="1" falsevalue="0" checked="false"/>
</when>
<when value="100">
<param argument="--salltitles" type="boolean" truevalue="--salltitles" falsevalue="" checked="true" label="Include full subject titles in DAA file?" help=""/>
Expand Down Expand Up @@ -99,11 +100,11 @@ ments whose score is at most 10% lower than the best alignment score for a query
</xml>

<xml name="block_size_low_sens">
<param name="block_size" argument="--block-size" type="float" value="2" label="Block size in billions of sequence letters to be processed at a time" help="" />
<param argument="--block-size" type="float" value="2" label="Block size in billions of sequence letters to be processed at a time" help="" />
</xml>

<xml name="block_size_hi_sens">
<param name="block_size" argument="--block-size" type="float" value="0.4" label="Block size in billions of sequence letters to be processed at a time" help="" />
<param argument="--block-size" type="float" value="0.4" label="Block size in billions of sequence letters to be processed at a time" help="" />
</xml>

<xml name="citations">
Expand Down Expand Up @@ -145,11 +146,16 @@ ments whose score is at most 10% lower than the best alignment score for a query
#else if $output.outfmt == "6"
--outfmt '6' #echo ' '.join(str($output.fields).split(','))
--out '$blast_tabular'
--unal $output.unal
#else if $output.outfmt == "100"
--outfmt '100'
$output.salltitles
$output.sallseqid
--out output.daa
#else if $output.outfmt == "101"
--outfmt '101'
$output.salltitles
$output.sallseqid
--out '$sam_output'
#else if $output.outfmt == "102"
--outfmt '102'
Expand Down
5 changes: 3 additions & 2 deletions tools/diamond/test-data/diamond_results.tabular
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 283 1 284 1.44e-205 550 94M1D189M 100 0 0 0
sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 283 1 284 5.77e-150 409 105M1D178M 100 0 0 0
sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 283 1 284 1.44e-205 550 100 0 0 0
sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 283 1 284 5.77e-150 409 100 0 0 0
shuffled * -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 * * *
6 changes: 6 additions & 0 deletions tools/diamond/test-data/protein.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@ EWIWGGFSVDKATLNRFFAFHFILFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
IENY
>shuffled sequence that should go to unaligned
XLPLILMLLGISPGSFEHTVAGGIWTSLMLFLPGYPGVGFLMLLVITVPALNFKFGFMLL
LKPTTNIIKTLVLALTHADDPLSFPWLNYMPPAADFNGLFTNAGATTTLYQIPYEGSFYL
AAIYGSMLHENHYLYRSMTPVGWLHLGDSGLRFMLLPIYYARITYDNVPAGWFLSVNTIL
GLTAILLEAIKALMANYSESQEPFCFSTGMKHSFIISDILGWDMSLYIILLIPHTNPFVL
TFLTLILWLDILSRYTLLQVNLIIFMTRHGHFQIADIWYWLKS

0 comments on commit 828c844

Please sign in to comment.