Skip to content

Commit

Permalink
Merge pull request #1427 from zargham-ahmad/issue_1424
Browse files Browse the repository at this point in the history
Fix bug in split_file_to_collection
  • Loading branch information
bgruening authored May 23, 2024
2 parents da47614 + eb3d855 commit 5d21f3d
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 28 deletions.
6 changes: 5 additions & 1 deletion tools/text_processing/split_file_to_collection/.shed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ owner: bgruening
name: split_file_to_collection
categories:
- Text Manipulation
description: Split tabular, MGF, FASTA, or FASTQ files to a dataset collection.
description: Split tabular, MGF, FASTA, or FASTQ files to a dataset collection.
long_description: |
Split file into a dataset collection.
Splits a data set consisting of records into multiple data sets within a collection.
remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection
homepage_url: https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection
type: unrestricted
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ def split_by_record(args, in_file, out_dir, top, ftype):
else:
record += line
# after loop, write final record to file
if new_file_counter in fresh_files:
new_file.write(header)
new_file.write(record)
new_file.close()

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="split_file_to_collection" name="Split file" version="0.5.1">
<tool id="split_file_to_collection" name="Split file" version="0.5.2">
<description>to dataset collection</description>
<macros>
<xml name="regex_sanitizer">
Expand Down Expand Up @@ -190,38 +190,38 @@
</conditional>
</inputs>
<outputs>
<collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}">
<collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: tabular">
<discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/>
<filter>split_parms['select_ftype'] == "tabular"</filter>
</collection>
<collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}">
<collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: mgf">
<discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/>
<filter>split_parms['select_ftype'] == "mgf"</filter>
</collection>
<collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}">
<collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: fasta">
<discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/>
<filter>split_parms['select_ftype'] == "fasta"</filter>
</collection>
<collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}">
<collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: fastq">
<discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/>
<filter>split_parms['select_ftype'] == "fastq"</filter>
</collection>
<collection name="list_output_sdf" type="list" label="${tool.name} on ${on_string}">
<collection name="list_output_sdf" type="list" label="${tool.name} on ${on_string}: sdf">
<discover_datasets pattern="__name__" directory="out" visible="false" format="sdf"/>
<filter>split_parms['select_ftype'] == "sdf"</filter>
</collection>
<collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}">
<collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}: txt">
<discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/>
<filter>split_parms['select_ftype'] == "txt"</filter>
</collection>
<collection name="list_output_generic" type="list" label="${tool.name} on ${on_string}">
<collection name="list_output_generic" type="list" label="${tool.name} on ${on_string}: generic">
<discover_datasets pattern="__name_and_ext__" directory="out" visible="false"/>
<filter>split_parms['select_ftype'] == "generic"</filter>
</collection>
</outputs>
<tests>
<!-- 1 -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.tabular" ftype="tabular"/>
<param name="select_ftype" value="tabular"/>
<param name="select_split_by" value="col"/>
Expand All @@ -236,7 +236,7 @@
</output_collection>
</test>
<!-- 2 -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.tabular" ftype="tabular"/>
<param name="select_ftype" value="tabular"/>
<param name="select_split_by" value="row"/>
Expand All @@ -250,7 +250,7 @@
</output_collection>
</test>
<!-- 3 -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.tabular" ftype="tabular"/>
<param name="select_ftype" value="tabular"/>
<param name="select_split_by" value="row"/>
Expand All @@ -265,7 +265,7 @@
</output_collection>
</test>
<!-- 4 -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.tabular" ftype="tabular"/>
<param name="select_ftype" value="tabular"/>
<param name="select_split_by" value="row"/>
Expand All @@ -280,7 +280,7 @@
</output_collection>
</test>
<!-- 5 -->
<test>
<test expect_num_outputs="1">
<param name="select_ftype" value="txt"/>
<param name="input" value="karyotype.txt" ftype="txt"/>
<param name="mode" value="numnew"/>
Expand Down Expand Up @@ -316,7 +316,7 @@
</output_collection>
</test>
<!-- 6 -->
<test>
<test expect_num_outputs="1">
<param name="input" value="psm.tabular" ftype="tabular"/>
<param name="select_ftype" value="tabular"/>
<param name="select_split_by" value="col"/>
Expand All @@ -332,7 +332,7 @@
</output_collection>
</test>
<!-- 7 splitting of mgf -->
<test>
<test expect_num_outputs="1">
<param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
<param name="select_ftype" value="mgf"/>
<param name="mode" value="numnew"/>
Expand All @@ -345,7 +345,7 @@
</output_collection>
</test>
<!-- 8 splitting of fasta + desired number of files-->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fasta" ftype="fasta"/>
<param name="select_ftype" value="fasta"/>
<param name="mode" value="numnew"/>
Expand All @@ -357,7 +357,7 @@
</output_collection>
</test>
<!-- 9 splitting of fasta + desired chunksize -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fasta" ftype="fasta"/>
<param name="select_ftype" value="fasta"/>
<param name="mode" value="chunk"/>
Expand All @@ -369,7 +369,7 @@
</output_collection>
</test>
<!-- 10 splitting of fastq, specify desired number of files -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fastq" ftype="fastq"/>
<param name="select_ftype" value="fastq"/>
<param name="mode" value="numnew"/>
Expand All @@ -383,7 +383,7 @@
<!-- 11 splitting of fastq, specify desired number of files
same as previous test, but by specifying the number of lines per record
explicitly (not using the preset of the python script) -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fastq" ftype="fastq"/>
<param name="select_ftype" value="generic"/>
<param name="select_split_method" value="number"/>
Expand All @@ -397,7 +397,7 @@
</output_collection>
</test>
<!-- splitting of fasta w random assignment and specific filename prefix -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fasta" ftype="fasta"/>
<param name="select_ftype" value="fasta"/>
<param name="mode" value="numnew"/>
Expand All @@ -411,7 +411,7 @@
</output_collection>
</test>
<!-- splitting of fasta w batch assignment and specific filename prefix -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fasta" ftype="fasta"/>
<param name="select_ftype" value="fasta"/>
<param name="mode" value="numnew"/>
Expand All @@ -424,7 +424,7 @@
</output_collection>
</test>
<!-- splitting of txt w default (alternating assignment) -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.tabular" ftype="txt"/>
<param name="select_ftype" value="txt"/>
<param name="mode" value="numnew"/>
Expand All @@ -436,7 +436,7 @@
</output_collection>
</test>
<!-- generic-regex splitting (of txt) w default assignment (alternating) -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.tabular" ftype="txt"/>
<param name="select_ftype" value="generic"/>
<param name="select_split_method" value="regex"/>
Expand All @@ -450,7 +450,7 @@
</output_collection>
</test>
<!-- generic-regex splitting (of a fasta) w random assignment -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fasta" ftype="fasta"/>
<param name="select_ftype" value="generic"/>
<param name="select_split_method" value="regex"/>
Expand All @@ -466,7 +466,7 @@
</output_collection>
</test>
<!-- sdf + specify desired number of files -->
<test>
<test expect_num_outputs="1">
<param name="input" value="3_molecules.sdf" ftype="sdf"/>
<param name="select_ftype" value="sdf"/>
<param name="mode" value="numnew"/>
Expand All @@ -480,7 +480,7 @@
</output_collection>
</test>
<!-- sdf + specify desired number of records per file (chunksize) -->
<test>
<test expect_num_outputs="1">
<param name="input" value="3_molecules.sdf" ftype="sdf"/>
<param name="select_ftype" value="sdf"/>
<param name="mode" value="chunk"/>
Expand All @@ -494,7 +494,7 @@
</output_collection>
</test>
<!-- test split_after (by splitting fasta files after non-header lines) -->
<test>
<test expect_num_outputs="1">
<param name="input" value="test.fasta" ftype="fasta"/>
<param name="select_ftype" value="generic"/>
<param name="select_split_method" value="regex"/>
Expand All @@ -510,6 +510,21 @@
<element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
</output_collection>
</test>
<!-- tabular split by row with top=2, chunk mode (chunksize=1) and batch allocation;
expects a single output collection holding four elements (split_file_000000 to split_file_000003).
NOTE(review): presumably the regression test for the header missing from the last chunk; confirm against the linked issue -->
<test expect_num_outputs="1">
<param name="input" value="test.tabular" ftype="tabular"/>
<param name="select_ftype" value="tabular"/>
<param name="select_split_by" value="row"/>
<param name="top" value="2"/>
<param name="mode" value="chunk"/>
<param name="chunksize" value="1"/>
<param name="allocate" value="batch"/>
<output_collection name="list_output_tab" type="list">
<element name="split_file_000000.tabular" file="split_file_0.tabular" ftype="tabular"/>
<element name="split_file_000001.tabular" file="split_file_1.tabular" ftype="tabular"/>
<element name="split_file_000002.tabular" file="split_file_2.tabular" ftype="tabular"/>
<element name="split_file_000003.tabular" file="split_file_3.tabular" ftype="tabular"/>
</output_collection>
</test>
</tests>
<help><![CDATA[
**Split file into a dataset collection**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#This is a file
#file data
foo.mgf bar
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#This is a file
#file data
foo2.mgf bar2
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#This is a file
#file data
foo3.mgf bar3
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#This is a file
#file data
foo.mgf bar4

0 comments on commit 5d21f3d

Please sign in to comment.