Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add accession number to Uniprot search options #720

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tools/uniprotxml_downloader/macros.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<param name="field" type="select" label="Field">
<option value="taxonomy_name">Taxonomy Name</option>
<option value="taxonomy_id">Taxonomy ID</option>
<option value="accession">Accession</option>
</param>
</xml>
</macros>
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
E1Q2I0
E1Q3C4
26 changes: 13 additions & 13 deletions tools/uniprotxml_downloader/uniprotxml_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,36 +47,36 @@ def send(self, request, **kwargs):
def __main__():
# Parse Command Line
parser = optparse.OptionParser()
parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs')
parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search search_ids')
parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search search_ids')
parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot')
parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format')
parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id'], default='taxonomy_name', help='query field')
parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field')
parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
(options, args) = parser.parse_args()
taxids = set(options.taxon)
search_ids = set(options.search_id)
if options.input:
with open(options.input, 'r') as inputFile:
for linenum, line in enumerate(inputFile):
if line.startswith('#'):
continue
fields = line.rstrip('\r\n').split('\t')
if len(fields) > abs(options.column):
taxid = fields[options.column].strip()
if taxid:
taxids.add(taxid)
taxon_queries = [f'{options.field}:"{taxid}"' for taxid in taxids]
taxon_query = ' OR '.join(taxon_queries)
search_id = fields[options.column].strip()
if search_id:
search_ids.add(search_id)
search_queries = [f'{options.field}:"{search_id}"' for search_id in search_ids]
search_query = ' OR '.join(search_queries)
if options.output:
dest_path = options.output
else:
dest_path = "uniprot_%s.xml" % '_'.join(taxids)
dest_path = "uniprot_%s.xml" % '_'.join(search_ids)
reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
try:
url = 'https://rest.uniprot.org/uniprotkb/stream'
query = "%s%s" % (taxon_query, reviewed)
query = "%s%s" % (search_query, reviewed)
params = {'query': query, 'format': options.format}
if options.debug:
print("%s ? %s" % (url, params), file=sys.stderr)
Expand Down Expand Up @@ -112,7 +112,7 @@ def __main__():
else:
print("failed: Not a uniprot xml file", file=sys.stderr)
exit(1)
print("NCBI Taxon ID:%s" % taxids, file=sys.stdout)
print("Search IDs:%s" % search_ids, file=sys.stdout)
if 'X-UniProt-Release' in response.headers:
print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout)
if 'X-Total-Results' in response.headers:
Expand Down
94 changes: 64 additions & 30 deletions tools/uniprotxml_downloader/uniprotxml_downloader.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="uniprotxml_downloader" name="UniProt" version="2.3.0" profile="21.01">
<tool id="uniprotxml_downloader" name="UniProt" version="2.4.0" profile="21.01">
<description>download proteome as XML or fasta</description>
<macros>
<import>macros.xml</import>
Expand All @@ -12,32 +12,32 @@
<command>
<![CDATA[
python '$__tool_directory__/uniprotxml_downloader.py'
#if $taxid.input_choice == 'common':
--taxon $taxid.organism
#if $input_method.input_choice == 'common':
--search-id $input_method.organism
--field taxonomy_id
#if $taxid.reviewed:
--reviewed=$taxid.reviewed
#if $input_method.reviewed:
--reviewed=$input_method.reviewed
#end if
#elif $taxid.input_choice == 'taxids':
--field $taxid.field
#for $id in $taxid.taxons.split(','):
-t '$id'
#elif $input_method.input_choice == 'enter_ids':
--field $input_method.field
#for $id in $input_method.ids.split(','):
--search-id '$id'
#end for
#elif $taxid.input_choice == 'history':
--field $taxid.field
--input='${taxid.taxon_file}'
--column=#echo int(str($taxid.column)) - 1#
#elif $input_method.input_choice == 'history':
--field $input_method.field
--input='${input_method.id_file}'
--column=#echo int(str($input_method.column)) - 1#
#end if
--format $format
--output '${proteome}'
]]>
</command>
<inputs>
<conditional name="taxid">
<conditional name="input_method">
<param name="input_choice" type="select" label="Select">
<option value="common">A Common Organism</option>
<option value="taxids">A manually entered list of Taxon IDs or names</option>
<option value="history">A history dataset with a column containing Taxon IDs or names</option>
<option value="enter_ids">A manually entered list of Uniprot IDs</option>
<option value="history">A history dataset with a column containing Uniprot IDs</option>
</param>
<when value="common">
<param name="organism" type="select" label="Common Organisms"
Expand All @@ -59,16 +59,16 @@ python '$__tool_directory__/uniprotxml_downloader.py'
<option value="no">UniProtKB/TrEMBL (unreviewed only)</option>
</param>
</when>
<when value="taxids">
<param name="taxons" type="text" label="NCBI Taxon IDs or names"
help="Enter one or more Organsim IDs (separated by commas) from http://www.uniprot.org/proteomes/">
<when value="enter_ids">
<param name="ids" type="text" label="Search ID values"
help="Enter one or more IDs (separated by commas) from http://www.uniprot.org/proteomes/">
<validator type="regex" message="OrganismID[,OrganismID]">^\w+( \w+)*(,\w+( \w+)*)*$</validator>
</param>
<expand macro="query_field"/>
</when>
<when value="history">
<param name="taxon_file" type="data" format="tabular,txt" label="Dataset (tab separated) with Taxon ID/Name column"/>
<param name="column" type="data_column" data_ref="taxon_file" label="Column with Taxon ID/name"/>
<param name="id_file" type="data" format="tabular,txt" label="Dataset (tab separated) with ID column"/>
<param name="column" type="data_column" data_ref="id_file" label="Column with ID"/>
<expand macro="query_field"/>
</when>
</conditional>
Expand All @@ -86,8 +86,8 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</outputs>
<tests>
<test>
<param name="input_choice" value="taxids"/>
<param name="taxons" value="1566990"/>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="1566990"/>
<param name="format" value="xml"/>
<output name="proteome">
<assert_contents>
Expand All @@ -96,8 +96,8 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</output>
</test>
<test>
<param name="input_choice" value="taxids"/>
<param name="taxons" value="765963,512562"/>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="765963,512562"/>
<param name="field" value="taxonomy_id"/>
<param name="format" value="fasta"/>
<output name="proteome">
Expand All @@ -108,8 +108,8 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</output>
</test>
<test>
<param name="input_choice" value="taxids"/>
<param name="taxons" value="Shi470,PeCan4"/>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="Shi470,PeCan4"/>
<param name="field" value="taxonomy_name"/>
<param name="format" value="fasta"/>
<output name="proteome">
Expand All @@ -119,9 +119,21 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</assert_contents>
</output>
</test>
<test>
<param name="input_choice" value="enter_ids"/>
<param name="ids" value="E1Q2I0,E1Q3C4"/>
<param name="field" value="accession"/>
<param name="format" value="fasta"/>
<output name="proteome">
<assert_contents>
<has_text text="E1Q2I0" />
<has_text text="E1Q3C4" />
</assert_contents>
</output>
</test>
<test>
<param name="input_choice" value="history"/>
<param name="taxon_file" value="Helicobacter_strains.tsv" ftype="tabular"/>
<param name="id_file" value="Helicobacter_strains.tsv" ftype="tabular"/>
<param name="column" value="1"/>
<param name="field" value="taxonomy_name"/>
<param name="format" value="fasta"/>
Expand All @@ -134,7 +146,7 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</test>
<test>
<param name="input_choice" value="history"/>
<param name="taxon_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
<param name="id_file" value="Helicobacter_strains_ids.tsv" ftype="tabular"/>
<param name="column" value="2"/>
<param name="field" value="taxonomy_id"/>
<param name="format" value="fasta"/>
Expand All @@ -145,6 +157,19 @@ python '$__tool_directory__/uniprotxml_downloader.py'
</assert_contents>
</output>
</test>
<test>
<param name="input_choice" value="history"/>
<param name="id_file" value="Helicobacter_protein_accessions.tsv" ftype="tabular"/>
<param name="column" value="1"/>
<param name="field" value="accession"/>
<param name="format" value="fasta"/>
<output name="proteome">
<assert_contents>
<has_text text="E1Q2I0" />
<has_text text="E1Q3C4" />
</assert_contents>
</output>
</test>
</tests>
<help>
<![CDATA[
Expand All @@ -160,7 +185,11 @@ Available taxon names: http://www.uniprot.org/taxonomy/

Example taxon: http://www.uniprot.org/taxonomy/512562

Taxon IDs or names can be entered as text or read from a column in a tabular dataset from your history.
Example protein: https://www.uniprot.org/uniprotkb/E1Q2I0/entry

Description of query fields: https://www.uniprot.org/help/query-fields

IDs can be entered as text or read from a column in a tabular dataset from your history.

Example IDs and names related to the Bacteria Helicobacter pylori (strain Shi470) ::

Expand All @@ -171,6 +200,11 @@ Example IDs and names releated to the Bacteria Helicobacter pylori (strain Shi47
- Helicobacter
- Helicobacteraceae

Example protein accession numbers from Helicobacter pylori:

- E1Q2I0
- E1Q3C4


UniProtKB help: http://www.uniprot.org/help/uniprotkb

Expand Down