Merge pull request #1427 from zargham-ahmad/issue_1424

Fix bug in split_file_to_collection
bgruening · May 23, 2024 · 5d21f3d · 5d21f3d
2 parents da47614 + eb3d855
commit 5d21f3d
Show file tree

Hide file tree

Showing 7 changed files with 61 additions and 28 deletions.
diff --git a/tools/text_processing/split_file_to_collection/.shed.yml b/tools/text_processing/split_file_to_collection/.shed.yml
@@ -2,6 +2,10 @@ owner: bgruening
 name: split_file_to_collection
 categories: 
   - Text Manipulation
-description: Split tabular, MGF, FASTA, or FASTQ files to a dataset collection. 
+description: Split tabular, MGF, FASTA, or FASTQ files to a dataset collection.
+long_description: |
+    Split file into a dataset collection.
+    Splits a data set consisting of records into multiple data sets within a collection.
 remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection
+homepage_url: https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection
 type: unrestricted
diff --git a/tools/text_processing/split_file_to_collection/split_file_to_collection.py b/tools/text_processing/split_file_to_collection/split_file_to_collection.py
@@ -329,6 +329,8 @@ def split_by_record(args, in_file, out_dir, top, ftype):
             else:
                 record += line
         # after loop, write final record to file
+        if new_file_counter in fresh_files:
+            new_file.write(header)
         new_file.write(record)
         new_file.close()
 

diff --git a/tools/text_processing/split_file_to_collection/split_file_to_collection.xml b/tools/text_processing/split_file_to_collection/split_file_to_collection.xml
@@ -1,4 +1,4 @@
-<tool id="split_file_to_collection" name="Split file" version="0.5.1">
+<tool id="split_file_to_collection" name="Split file" version="0.5.2">
     <description>to dataset collection</description>
     <macros>
         <xml name="regex_sanitizer">
@@ -190,38 +190,38 @@
         </conditional>
     </inputs>
     <outputs>
-        <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}">
+        <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: tabular">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/>
             <filter>split_parms['select_ftype'] == "tabular"</filter>
         </collection>
-        <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}">
+        <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: mgf">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/>
             <filter>split_parms['select_ftype'] == "mgf"</filter>
         </collection>
-        <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}">
+        <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: fasta">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/>
             <filter>split_parms['select_ftype'] == "fasta"</filter>
         </collection>
-        <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}">
+        <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: fastq">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/>
             <filter>split_parms['select_ftype'] == "fastq"</filter>
         </collection>
-        <collection name="list_output_sdf" type="list" label="${tool.name} on ${on_string}">
+        <collection name="list_output_sdf" type="list" label="${tool.name} on ${on_string}: sdf">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="sdf"/>
             <filter>split_parms['select_ftype'] == "sdf"</filter>
         </collection>
-        <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}">
+        <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}: txt">
             <discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/>
             <filter>split_parms['select_ftype'] == "txt"</filter>
         </collection>
-        <collection name="list_output_generic" type="list" label="${tool.name} on ${on_string}">
+        <collection name="list_output_generic" type="list" label="${tool.name} on ${on_string}: generic">
             <discover_datasets pattern="__name_and_ext__" directory="out" visible="false"/>
             <filter>split_parms['select_ftype'] == "generic"</filter>
         </collection>
     </outputs>
     <tests>
         <!-- 1 -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
             <param name="select_split_by" value="col"/>
@@ -236,7 +236,7 @@
             </output_collection>
         </test>
         <!-- 2 -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
             <param name="select_split_by" value="row"/>
@@ -250,7 +250,7 @@
             </output_collection>
         </test>
         <!-- 3 -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
             <param name="select_split_by" value="row"/>
@@ -265,7 +265,7 @@
             </output_collection>
         </test>
         <!-- 4 -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
             <param name="select_split_by" value="row"/>
@@ -280,7 +280,7 @@
             </output_collection>
         </test>
         <!-- 5 -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="select_ftype" value="txt"/>
             <param name="input" value="karyotype.txt" ftype="txt"/>
             <param name="mode" value="numnew"/>
@@ -316,7 +316,7 @@
             </output_collection>
         </test>
         <!-- 6 -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="psm.tabular" ftype="tabular"/>
             <param name="select_ftype" value="tabular"/>
             <param name="select_split_by" value="col"/>
@@ -332,7 +332,7 @@
             </output_collection>
         </test>
         <!-- 7 splitting of mgf -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
             <param name="select_ftype" value="mgf"/>
             <param name="mode" value="numnew"/>
@@ -345,7 +345,7 @@
             </output_collection>
         </test>
         <!-- 8 splitting of fasta + desired number of files-->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
             <param name="mode" value="numnew"/>
@@ -357,7 +357,7 @@
             </output_collection>
         </test>
         <!-- 9 splitting of fasta + desired chunksize -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
             <param name="mode" value="chunk"/>
@@ -369,7 +369,7 @@
             </output_collection>
         </test>
         <!-- 10 splitting of fastq, specify desired number of files -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fastq" ftype="fastq"/>
             <param name="select_ftype" value="fastq"/>
             <param name="mode" value="numnew"/>
@@ -383,7 +383,7 @@
         <!-- 11 splitting of fastq, specify desired number of files 
              same as previous test, but by specifying the number of lines per record
             explicitly (not using the preset of the python script) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fastq" ftype="fastq"/>
             <param name="select_ftype" value="generic"/>
             <param name="select_split_method" value="number"/>
@@ -397,7 +397,7 @@
             </output_collection>
         </test>
         <!-- splitting of fasta w random assignment and specific filename prefix -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
             <param name="mode" value="numnew"/>
@@ -411,7 +411,7 @@
             </output_collection>
         </test>
         <!-- splitting of fasta w batch assignment and specific filename prefix -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="fasta"/>
             <param name="mode" value="numnew"/>
@@ -424,7 +424,7 @@
             </output_collection>
         </test>
         <!-- splitting of txt w default (alternating assignment) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="txt"/>
             <param name="mode" value="numnew"/>
@@ -436,7 +436,7 @@
             </output_collection>
         </test>
         <!-- generic-regex splitting (of txt) w default assignment (alternating) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.tabular" ftype="txt"/>
             <param name="select_ftype" value="generic"/>
             <param name="select_split_method" value="regex"/>
@@ -450,7 +450,7 @@
             </output_collection>
         </test>
         <!-- generic-regex splitting (of a fasta) w random assignment -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="generic"/>
             <param name="select_split_method" value="regex"/>
@@ -466,7 +466,7 @@
             </output_collection>
         </test>
         <!-- sdf + specify desired number of files -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="3_molecules.sdf" ftype="sdf"/>
             <param name="select_ftype" value="sdf"/>
             <param name="mode" value="numnew"/>
@@ -480,7 +480,7 @@
             </output_collection>
         </test>
         <!-- sdf + specify desired number of records per file (chunksize) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="3_molecules.sdf" ftype="sdf"/>
             <param name="select_ftype" value="sdf"/>
             <param name="mode" value="chunk"/>
@@ -494,7 +494,7 @@
             </output_collection>
         </test>
         <!-- test split_after (by splitting fasta files after non-header lines) -->
-        <test>
+        <test expect_num_outputs="1">
             <param name="input" value="test.fasta" ftype="fasta"/>
             <param name="select_ftype" value="generic"/>
             <param name="select_split_method" value="regex"/>
@@ -510,6 +510,21 @@
                 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
             </output_collection>
         </test>
+        <test expect_num_outputs="1">
+            <param name="input" value="test.tabular" ftype="tabular"/>
+            <param name="select_ftype" value="tabular"/>
+            <param name="select_split_by" value="row"/>
+            <param name="top" value="2"/>
+            <param name="mode" value="chunk"/>
+            <param name="chunksize" value="1"/>
+            <param name="allocate" value="batch"/>
+            <output_collection name="list_output_tab" type="list">
+                <element name="split_file_000000.tabular" file="split_file_0.tabular" ftype="tabular"/>
+                <element name="split_file_000001.tabular" file="split_file_1.tabular" ftype="tabular"/>
+                <element name="split_file_000002.tabular" file="split_file_2.tabular" ftype="tabular"/>
+                <element name="split_file_000003.tabular" file="split_file_3.tabular" ftype="tabular"/>
+            </output_collection>
+        </test>
     </tests>
     <help><![CDATA[
 **Split file into a dataset collection**

diff --git a/tools/text_processing/split_file_to_collection/test-data/split_file_0.tabular b/tools/text_processing/split_file_to_collection/test-data/split_file_0.tabular
@@ -0,0 +1,3 @@
+#This is a file
+#file   data
+foo.mgf	bar
diff --git a/tools/text_processing/split_file_to_collection/test-data/split_file_1.tabular b/tools/text_processing/split_file_to_collection/test-data/split_file_1.tabular
@@ -0,0 +1,3 @@
+#This is a file
+#file   data
+foo2.mgf	bar2
diff --git a/tools/text_processing/split_file_to_collection/test-data/split_file_2.tabular b/tools/text_processing/split_file_to_collection/test-data/split_file_2.tabular
@@ -0,0 +1,3 @@
+#This is a file
+#file   data
+foo3.mgf	bar3
diff --git a/tools/text_processing/split_file_to_collection/test-data/split_file_3.tabular b/tools/text_processing/split_file_to_collection/test-data/split_file_3.tabular
@@ -0,0 +1,3 @@
+#This is a file
+#file   data
+foo.mgf	bar4