Skip to content

Commit

Permalink
Generalize tool parameters and naming conventions for searching any f…
Browse files Browse the repository at this point in the history
…ield in Uniprot, not just taxon-related fields. Add accession number as a search option. (#720)
  • Loading branch information
reid-wagner authored Jul 6, 2023
1 parent 21426ab commit 0c52223
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 43 deletions.
1 change: 1 addition & 0 deletions tools/uniprotxml_downloader/macros.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<param name="field" type="select" label="Field">
<option value="taxonomy_name">Taxonomy Name</option>
<option value="taxonomy_id">Taxonomy ID</option>
<option value="accession">Accession</option>
</param>
</xml>
</macros>
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
E1Q2I0
E1Q3C4
26 changes: 13 additions & 13 deletions tools/uniprotxml_downloader/uniprotxml_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,36 +47,36 @@ def send(self, request, **kwargs):
def __main__():
# Parse Command Line
parser = optparse.OptionParser()
parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs')
parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids')
parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids')
parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot')
parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format')
parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id'], default='taxonomy_name', help='query field')
parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field')
parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
(options, args) = parser.parse_args()
taxids = set(options.taxon)
search_ids = set(options.search_id)
if options.input:
with open(options.input, 'r') as inputFile:
for linenum, line in enumerate(inputFile):
if line.startswith('#'):
continue
fields = line.rstrip('\r\n').split('\t')
if len(fields) > abs(options.column):
taxid = fields[options.column].strip()
if taxid:
taxids.add(taxid)
taxon_queries = [f'{options.field}:"{taxid}"' for taxid in taxids]
taxon_query = ' OR '.join(taxon_queries)
search_id = fields[options.column].strip()
if search_id:
search_ids.add(search_id)
search_queries = [f'{options.field}:"{search_id}"' for search_id in search_ids]
search_query = ' OR '.join(search_queries)
if options.output:
dest_path = options.output
else:
dest_path = "uniprot_%s.xml" % '_'.join(taxids)
dest_path = "uniprot_%s.xml" % '_'.join(search_ids)
reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
try:
url = 'https://rest.uniprot.org/uniprotkb/stream'
query = "%s%s" % (taxon_query, reviewed)
query = "%s%s" % (search_query, reviewed)
params = {'query': query, 'format': options.format}
if options.debug:
print("%s ? %s" % (url, params), file=sys.stderr)
Expand Down Expand Up @@ -112,7 +112,7 @@ def __main__():
else:
print("failed: Not a uniprot xml file", file=sys.stderr)
exit(1)
print("NCBI Taxon ID:%s" % taxids, file=sys.stdout)
print("Search IDs:%s" % search_ids, file=sys.stdout)
if 'X-UniProt-Release' in response.headers:
print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout)
if 'X-Total-Results' in response.headers:
Expand Down
94 changes: 64 additions & 30 deletions tools/uniprotxml_downloader/uniprotxml_downloader.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="uniprotxml_downloader" name="UniProt" version="2.3.0" profile="21.01">
<tool id="uniprotxml_downloader" name="UniProt" version="2.4.0" profile="21.01">
<description>download proteome as XML or fasta</description>
<macros>
<import>macros.xml</import>
Expand All @@ -12,32 +12,32 @@
<command>
<![CDATA[
python '$__tool_directory__/uniprotxml_downloader.py'
#if $taxid.input_choice == 'common':
--taxon $taxid.organism
#if $input_method.input_choice == 'common':
--search-id $input_method.organism
--field taxonomy_id
#if $taxid.reviewed:
--reviewed=$taxid.reviewed
#if $input_method.reviewed:
--reviewed=$input_method.reviewed
#end if
#elif $taxid.input_choice == 'taxids':
--field $taxid.field
#for $id in $taxid.taxons.split(','):
-t '$id'
#elif $input_method.input_choice == 'enter_ids':
--field $input_method.field
#for $id in $input_method.ids.split(','):
--search-id '$id'
#end for
#elif $taxid.input_choice == 'history':
--field $taxid.field
--input='${taxid.taxon_file}'
--column=#echo int(str($taxid.column)) - 1#
#elif $input_method.input_choice == 'history':
--field $input_method.field
--input='${input_method.id_file}'
--column=#echo int(str($input_method.column)) - 1#
#end if
--format $format
--output '${proteome}'
]]>
</command>
<inputs>
<conditional name="taxid">
<conditional name="input_method">
<param name="input_choice" type="select" label="Select">
<option value="common">A Common Organism</option>
<option value="taxids">A manually entered list of Taxon IDs or names</option>
<option value="history">A history dataset with a column containing Taxon IDs or names</option>
<option value="enter_ids">A manually entered list of Uniprot IDs</option>
<option value="history">A history dataset with a column containing Uniprot IDs</option>
</param>
<when value="common">
<param name="organism" type="select" label="Common Organisms"
Expand All @@ -59,16 +59,16 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<option value="no">UniProtKB/TrEMBL (unreviewed only)</option>
</param>
</when>
<when value="taxids">
<param name="taxons" type="text" label="NCBI Taxon IDs or names"
help="Enter one or more Organsim IDs (separated by commas) from http://www.uniprot.org/proteomes/">
<when value="enter_ids">
<param name="ids" type="text" label="Search ID values"
help="Enter one or more IDs (separated by commas) from http://www.uniprot.org/proteomes/">
<validator type="regex" message="OrganismID[,OrganismID]">^\w+( \w+)*(,\w+( \w+)*)*$</validator>
</param>
<expand macro="query_field"/>
</when>
<when value="history">
<param name="taxon_file" type="data" format="tabular,txt" label="Dataset (tab separated) with Taxon ID/Name column"/>
<param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon ID/name"/>
<param name="id_file" type="data" format="tabular,txt" label="Dataset (tab separated) with ID column"/>
<param name="column" type="data_column" data_ref="id_file" label="Column with ID"/>
<expand macro="query_field"/>
</when>
</conditional>
Expand All @@ -86,8 +86,8 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</outputs>
<tests>
<test>
<param name="input_choice" value="taxids"/>
<param name="taxons" value="1566990"/>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="1566990"/>
<param name="format" value="xml"/>
<output name="proteome">
<assert_contents>
Expand All @@ -96,8 +96,8 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</output>
</test>
<test>
<param name="input_choice" value="taxids"/>
<param name="taxons" value="765963,512562"/>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="765963,512562"/>
<param name="field" value="taxonomy_id"/>
<param name="format" value="fasta"/>
<output name="proteome">
Expand All @@ -108,8 +108,8 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</output>
</test>
<test>
<param name="input_choice" value="taxids"/>
<param name="taxons" value="Shi470,PeCan4"/>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="Shi470,PeCan4"/>
<param name="field" value="taxonomy_name"/>
<param name="format" value="fasta"/>
<output name="proteome">
Expand All @@ -119,9 +119,21 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</assert_contents>
</output>
</test>
<test>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="E1Q2I0,E1Q3C4"/>
<param name="field" value="accession"/>
<param name="format" value="fasta"/>
<output name="proteome">
<assert_contents>
<has_text text="E1Q2I0" />
<has_text text="E1Q3C4" />
</assert_contents>
</output>
</test>
<test>
<param name="input_choice" value="history"/>
<param name="taxon_file" value="Helicobacter_strains.tsv" ftype="tabular"/>
<param name="id_file" value="Helicobacter_strains.tsv" ftype="tabular"/>
<param name="column" value="1"/>
<param name="field" value="taxonomy_name"/>
<param name="format" value="fasta"/>
Expand All @@ -134,7 +146,7 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</test>
<test>
<param name="input_choice" value="history"/>
<param name="taxon_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
<param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
<param name="column" value="2"/>
<param name="field" value="taxonomy_id"/>
<param name="format" value="fasta"/>
Expand All @@ -145,6 +157,19 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</assert_contents>
</output>
</test>
<test>
<param name="input_choice" value="history"/>
<param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/>
<param name="column" value="1"/>
<param name="field" value="accession"/>
<param name="format" value="fasta"/>
<output name="proteome">
<assert_contents>
<has_text text="E1Q2I0" />
<has_text text="E1Q3C4" />
</assert_contents>
</output>
</test>
</tests>
<help>
<![CDATA[
Expand All @@ -160,7 +185,11 @@ Available taxon names: http://www.uniprot.org/taxonomy/
Example taxon: http://www.uniprot.org/taxonomy/512562
Taxon IDs or names can be entered as text or read from a column in a tabular dataset from your history.
Example protein: https://www.uniprot.org/uniprotkb/E1Q2I0/entry
Description of query fields: https://www.uniprot.org/help/query-fields
IDs can be entered as text or read from a column in a tabular dataset from your history.
Example IDs and names related to the Bacteria Helicobacter pylori (strain Shi470) ::
Expand All @@ -171,6 +200,11 @@ Example IDs and names releated to the Bacteria Helicobacter pylori (strain Shi47
- Helicobacter
- Helicobacteraceae
Example protein accession numbers from Helicobacter pylori:
- E1Q2I0
- E1Q3C4
UniProtKB help: http://www.uniprot.org/help/uniprotkb
Expand Down

0 comments on commit 0c52223

Please sign in to comment.