diamond various fixes (#4210)

* diamond: fix test parameter name (comp-based-stat needs to be comp_based_stats); also remove the param name attribute where possible * add options missing from the command line - salltitles and sallseqid were in the inputs but unused in the CLI - fix --unal (reports unaligned queries in fmt 6) - add --al and modify --un for reporting (un)aligned query sequences
galaxyproject · Nov 27, 2021 · 828c844 · 828c844
1 parent 6ecdbaa
commit 828c844
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 35 deletions.
diff --git a/tools/diamond/diamond.xml b/tools/diamond/diamond.xml
@@ -1,4 +1,4 @@
-<tool id="bg_diamond" name="Diamond" version="@[email protected]" profile="19.01">
+<tool id="bg_diamond" name="Diamond" version="@[email protected]" profile="19.01">
     <description>alignment tool for short sequences against a protein database</description>
     <macros>
         <import>macros.xml</import>
@@ -62,8 +62,23 @@
         --query-cover '$query_cover'
         --subject-cover '$subject_cover'
         --block-size '$sens_cond.block_size'
-        #if str($unal) == '1':
-            --unal 1 --un '$unalqueries'
+        #if $output_unal
+            #if "--un" in $output_unal
+                --un '$unalqueries'
+                #if $query.ext.startswith("fasta"):
+                    --unfmt fasta
+                #else
+                    --unfmt fastq
+                #end if
+            #end if
+            #if "--al" in $output_unal
+                --al '$alqueries'
+                #if $query.ext.startswith("fasta"):
+                    --alfmt fasta
+                #else
+                    --alfmt fastq
+                #end if
+            #end if
         #end if
         #if $tax_cond.tax_select == 'file':
             --taxonlist `cat '$tax_cond.taxonlistfile' | grep -v "^#" | grep -v "^$" | tr "\n" "," | sed 's/,$//'`
@@ -79,7 +94,7 @@
                 <option value="blastx">Align DNA query sequences (blastx)</option>
             </param>
             <when value="blastx">
-                <param name="query_gencode" argument="--query-gencode" type="select" label="Genetic code used for translation of query in BLASTX mode" help="">
+                <param argument="--query-gencode" type="select" label="Genetic code used for translation of query in BLASTX mode" help="">
                     <option value="1">The Standard Code</option>
                     <option value="2">The Vertebrate Mitochondrial Code</option>
                     <option value="3">The Yeast Mitochondrial Code</option>
@@ -100,7 +115,7 @@
                     <option value="25">Candidate Division SR1 and Gracilibacteria Code</option>
                     <option value="26">Pachysolen tannophilus Nuclear Code</option>
                 </param>
-                <param argument="--min-orf" name="min_orf" type="integer" value="1" label="ignore translated sequences without an open reading frame of at least this length" help="By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature" />
+                <param argument="--min-orf" type="integer" value="1" label="ignore translated sequences without an open reading frame of at least this length" help="By default this feature is disabled for sequences of length below 30, set to 20 for sequences of length below 100, and set to 40 otherwise. Setting this option to 1 will disable this feature" />
 
                 <param name="query_strand" argument="--strand" type="select" label="query strands to search" help="">
                     <option value="both" selected="True">Both</option>
@@ -113,21 +128,21 @@
                         <option value="no" selected="true">no</option>
                     </param>
                     <when value="yes">
-                        <param argument="--range-culling" name="range_culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. Note that this feature is currently only available in frameshift alignment mode"/>
+                        <param argument="--range-culling" type="boolean" truevalue="--range-culling" falsevalue="" checked="false" label="restrict hit culling to overlapping query ranges" help="This feature is designed for long query DNA sequences that may span several genes. In these cases, the default of reporting the 25 best overall hits could cause hits to a lower scoring gene to be overshadowed. But just increasing the number of alignments reported will bloat the output size and reduce performance. Using this feature along with -k 25 (default), a hit will only be deleted if at least 50% of its query range is spanned by at least 25 higher or equal scoring hits. Using this feature along with --top 10, a hit will only be deleted if its score is more than 10% lower than that of a higher scoring hit over at least 50% of its query range. The percentage is configurable using --range-cover. Note that this feature is currently only available in frameshift alignment mode"/>
                         <param argument="--frameshift" type="integer" value="0" label="frame shift penalty" help="Values around 15 are reasonable for this parameter. Enabling this feature will have the aligner tolerate missing bases in DNA sequences and is most recommended for long, error-prone sequences like MinION reads. In the pairwise output format, frameshifts will be indicated by \ and / for a shift by +1 and -1 nucleotide in the direction of translation respectively." />
                     </when>
                     <when value="no"/>
                 </conditional>
 
-                <param name="comp_based_stats" argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
+                <param argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
                     <option value="0">Disable</option>
                     <option value="1" selected="True">Default mode (Hauser, 2016)</option>
                 </param>
             </when>
             <when value="blastp">
-                <param name="no_self_hits" argument="--no-self-hits" type="boolean" truevalue="--no-self-hits" falsevalue="" checked="true" label="suppress reporting of identical self hits?" help=""/>
+                <param argument="--no-self-hits" type="boolean" truevalue="--no-self-hits" falsevalue="" checked="true" label="suppress reporting of identical self hits?" help=""/>
 
-                <param name="comp_based_stats" argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
+                <param argument="--comp-based-stats" type="select" label="Composition based statistics" help="Compositionally biased sequences often cause false positive matches, which are effectively filtered by this algorithm in a way similar to the composition based statistics used by BLAST">
                     <option value="0">Disable</option>
                     <option value="1" selected="True">Default mode (Hauser, 2016)</option>
                     <option value="2">Compositional matrix adjust conditioned on sequence properties, simplified (Yu, 2005)</option>
@@ -234,18 +249,24 @@
         </conditional>
         <expand macro="hit_filter_macro" />
         <param argument="--id" type="integer" value="0" label="Minimum identity percentage to report an alignment" help="" />
-        <param name="query_cover" argument="--query-cover" type="integer" value="0" label="Minimum query cover percentage to report an alignment" help="" />
-        <param name="subject_cover" argument="--subject-cover" type="integer" value="0" label="Minimum subject cover percentage to report an alignment" help="" />
-        <param argument="--unal" type="boolean" truevalue="1" falsevalue="0" checked="false" label="report unaligned queries" help=""/>
+        <param argument="--query-cover" type="integer" value="0" label="Minimum query cover percentage to report an alignment" help="" />
+        <param argument="--subject-cover" type="integer" value="0" label="Minimum subject cover percentage to report an alignment" help="" />
+        <param name="output_unal" type="select" optional="true" multiple="true" label="Output aligned/unaligned queries to separate file" help="">
+            <option value="--un">Output unaligned queries (--un)</option>
+            <option value="--al">Output aligned queries (--al)</option>
+        </param>
     </inputs>
     <outputs>
         <expand macro="output_macro" />
-        <data format="fasta" name="unalqueries" label="${tool.name} on ${on_string} (unaligned queries)">
-            <filter>unal == "1"</filter>
+        <data format_source="query" name="unalqueries" label="${tool.name} on ${on_string}: unaligned queries">
+            <filter>output_unal and "--un" in output_unal</filter>
+        </data>
+        <data format_source="query" name="alqueries" label="${tool.name} on ${on_string}: aligned queries">
+            <!-- Created only when the "--al" option is selected in output_unal, matching the command line that writes to $alqueries. -->
+            <filter>output_unal and "--al" in output_unal</filter>
         </data>
     </outputs>
     <tests>
-        <test>
+        <test expect_num_outputs="3">
             <conditional name="method_cond">
                 <param name="method_select" value="blastp" />
             </conditional>
@@ -256,13 +277,15 @@
             </conditional>
             <conditional name="output">
                 <param name="outfmt" value="6"/>
-                <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,cigar,scovhsp,sskingdoms,skingdoms,sphylums"/>
+                <!-- removed ,cigar from test: https://github.com/bbuchfink/diamond/issues/532 -->
+                <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,sskingdoms,skingdoms,sphylums"/>
+                <param name="unal" value="true"/>
             </conditional>
             <conditional name="sens_cond">
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="comp-based-stat" value="1"/>
+            <param name="comp_based_stats" value="1"/>
             <param name="masking" value="1"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="max"/>
@@ -277,9 +300,20 @@
             <conditional name="sens_cond">
                 <param name="block_size" value="2"/>
             </conditional>
+            <param name="output_unal" value="--al,--un"/>
+            <output name="unalqueries">
+                <assert_contents>
+                    <has_line line=">shuffled sequence that should go to unaligned"/>
+                </assert_contents>
+            </output>
+            <output name="alqueries">
+                <assert_contents>
+                    <has_line line=">sequence more text"/>
+                </assert_contents>
+            </output>
             <output name="blast_tabular" file="diamond_results.tabular"/>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastp" />
             </conditional>
@@ -300,7 +334,7 @@
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="comp-based-stat" value="1"/>
+            <param name="comp_based_stats" value="1"/>
             <param name="masking" value="1"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="max"/>
@@ -317,7 +351,7 @@
             </conditional>
             <output name="blast_tabular" file="diamond_results.wtax.tabular"/>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastx" />
                 <conditional name="frameshift_cond">
@@ -336,7 +370,7 @@
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="comp-based-stat" value="1"/>
+            <param name="comp_based_stats" value="1"/>
             <param name="masking" value="1"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="top"/>
@@ -353,7 +387,7 @@
             </conditional>
             <output name="blast_tabular" file="diamond_results.pairwise"/>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastp" />
             </conditional>
@@ -364,10 +398,12 @@
             </conditional>
             <conditional name="output">
                 <param name="outfmt" value="100"/>
+                <param name="salltitles" value="false"/>
+                <param name="sallseqid" value="false"/>
             </conditional>
             <output name="daa_output" file="diamond_results.daa" compare="sim_size" delta="10"/>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <conditional name="method_cond">
                 <param name="method_select" value="blastx" />
                 <conditional name="frameshift_cond">
@@ -386,7 +422,7 @@
                 <param name="sensitivity" value=""/>
             </conditional>
             <param name="matrix" value="BLOSUM62"/>
-            <param name="comp-based-stat" value="1"/>
+            <param name="comp_based_stats" value="1"/>
             <param name="masking" value="1"/>
             <conditional name="hit_filter">
                 <param name="hit_filter_select" value="top"/>

diff --git a/tools/diamond/diamond_makedb.xml b/tools/diamond/diamond_makedb.xml
@@ -1,4 +1,4 @@
-<tool id="bg_diamond_makedb" name="Diamond makedb" version="@VERSION@" profile="19.01">
+<tool id="bg_diamond_makedb" name="Diamond makedb" version="@TOOL_VERSION@" profile="19.01">
     <description>Build database from a FASTA file</description>
     <macros>
         <import>macros.xml</import>

diff --git a/tools/diamond/diamond_view.xml b/tools/diamond/diamond_view.xml
@@ -1,4 +1,4 @@
-<tool id="bg_diamond_view" name="Diamond view" version="@VERSION@" profile="19.01">
+<tool id="bg_diamond_view" name="Diamond view" version="@[email protected]" profile="19.01">
     <description>generate formatted output from DAA files</description>
     <macros>
         <import>macros.xml</import>
@@ -29,7 +29,7 @@
         <expand macro="output_macro" />
     </outputs>
     <tests>
-        <test>
+        <test expect_num_outputs="1">
             <param name="daa" ftype="daa" value="diamond_results.daa" />
             <conditional name="output">
                 <param name="outfmt" value="5"/>
@@ -40,15 +40,15 @@
             </conditional>
             <output name="blast_tabular" file="diamond_results.xml"/>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <param name="daa" ftype="daa" value="diamond_results.daa" />
             <conditional name="output">
                 <param name="outfmt" value="6"/>
                 <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,cigar,scovhsp"/>
             </conditional>
             <output name="blast_tabular" file="diamond_view_results.tabular"/>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <param name="daa" ftype="daa" value="diamond_results.daa" />
             <conditional name="output">
                 <param name="outfmt" value="101"/>

diff --git a/tools/diamond/macros.xml b/tools/diamond/macros.xml
@@ -1,9 +1,9 @@
 <macros>
-    <token name="@VERSION@">2.0.8</token>
+    <token name="@TOOL_VERSION@">2.0.8</token>
 
     <xml name="requirements">
         <requirements>
-          <requirement type="package" version="@VERSION@">diamond</requirement>
+          <requirement type="package" version="@TOOL_VERSION@">diamond</requirement>
         </requirements>
     </xml>
 
@@ -69,6 +69,7 @@
                     <option value="cigar">Cigar</option>
                     <yield/>
                 </param>
+                <param argument="--unal" type="boolean" label="Report unaligned queries" truevalue="1" falsevalue="0" checked="false"/>
             </when>
             <when value="100">
                 <param argument="--salltitles" type="boolean" truevalue="--salltitles" falsevalue="" checked="true" label="Include full subject titles in DAA file?" help=""/>
@@ -99,11 +100,11 @@ ments whose score is at most 10% lower than the best alignment score for a query
     </xml>
 
     <xml name="block_size_low_sens">
-        <param name="block_size" argument="--block-size" type="float" value="2" label="Block size in billions of sequence letters to be processed at a time" help="" />
+        <param argument="--block-size" type="float" value="2" label="Block size in billions of sequence letters to be processed at a time" help="" />
     </xml>
 
     <xml name="block_size_hi_sens">
-        <param name="block_size" argument="--block-size" type="float" value="0.4" label="Block size in billions of sequence letters to be processed at a time" help="" />
+        <param argument="--block-size" type="float" value="0.4" label="Block size in billions of sequence letters to be processed at a time" help="" />
     </xml>
 
     <xml name="citations">
@@ -145,11 +146,16 @@ ments whose score is at most 10% lower than the best alignment score for a query
         #else if $output.outfmt == "6"
             --outfmt '6' #echo ' '.join(str($output.fields).split(','))
             --out '$blast_tabular'
+            --unal $output.unal
         #else if $output.outfmt == "100"
             --outfmt '100'
+            $output.salltitles
+            $output.sallseqid
             --out output.daa
         #else if $output.outfmt == "101"
             --outfmt '101'
+            $output.salltitles
+            $output.sallseqid
             --out '$sam_output'
         #else if $output.outfmt == "102"
             --outfmt '102'

diff --git a/tools/diamond/test-data/diamond_results.tabular b/tools/diamond/test-data/diamond_results.tabular
@@ -1,2 +1,3 @@
-sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	283	1	284	1.44e-205	550	94M1D189M	100	0	0	0
-sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	283	1	284	5.77e-150	409	105M1D178M	100	0	0	0
+sequence	gi|5524211|gb|AAD44166.1|	99.6	284	0	1	1	283	1	284	1.44e-205	550	100	0	0	0
+sequence	gi|5524212|gb|AAD44167.1|	79.6	284	57	1	1	283	1	284	5.77e-150	409	100	0	0	0
+shuffled	*	-1	-1	-1	-1	-1	-1	-1	-1	-1	-1	-1	*	*	*
diff --git a/tools/diamond/test-data/protein.fasta b/tools/diamond/test-data/protein.fasta
@@ -4,3 +4,9 @@ EWIWGGFSVDKATLNRFFAFHFILFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
 LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
 GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
 IENY
+>shuffled sequence that should go to unaligned
+XLPLILMLLGISPGSFEHTVAGGIWTSLMLFLPGYPGVGFLMLLVITVPALNFKFGFMLL
+LKPTTNIIKTLVLALTHADDPLSFPWLNYMPPAADFNGLFTNAGATTTLYQIPYEGSFYL
+AAIYGSMLHENHYLYRSMTPVGWLHLGDSGLRFMLLPIYYARITYDNVPAGWFLSVNTIL
+GLTAILLEAIKALMANYSESQEPFCFSTGMKHSFIISDILGWDMSLYIILLIPHTNPFVL
+TFLTLILWLDILSRYTLLQVNLIIFMTRHGHFQIADIWYWLKS