-
Notifications
You must be signed in to change notification settings - Fork 0
/
FindClade.py
executable file
·70 lines (56 loc) · 2.53 KB
/
FindClade.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
from __future__ import print_function
""" Takes a taxonomical term (scientific/common species name), and returns
a higher clade name (family, order, phylum...)"""
import sys
import requests
import bs4
import argparse
import fileinput
# Seem to be forbidden when not http secure.
URL = "https://www.ncbi.nlm.nih.gov/taxonomy/"
URL_ASSEMBLY = "https://www.ncbi.nlm.nih.gov/assembly/"
REPORT = "info"
def iter_clades(names, levels=['family'], http=False, na='NA', assembly=False):
# Check if names are given as arguments of the script. If not, read stdin.
# TODO: error when no data provided to stdin after some time lapse.
url = URL.replace('https', 'http') if http else URL
if not names: names = [line.rstrip() for line in fileinput.input(('-',))]
for name in names:
data = {"term": "%s" % name,
"report": REPORT}
r = requests.post(url, data=data)
if not r.ok:
print("Reason:", r.reason, file=sys.stderr)
raise r.raise_for_status()
soup = bs4.BeautifulSoup(r.text, 'lxml')
out = {level: na for level in levels}
found_levels = 0
for dl in soup.findAll('dl'): # description lists
if dl.dt.text == "Lineage:": # title of the description list item
for a in dl.findAll('a'): # link
atitle = a.attrs.get('title')
if atitle in levels:
out[atitle] = a.text
found_levels += 1
if found_levels == len(levels):
break
break
yield [out[level] for level in levels]
def main():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-l", "--levels", default="family",
help="taxonomic levels to fetch, comma-separated [%(default)s]")
parser.add_argument("names", nargs='*',
help="species names to convert, if empty, read from stdin")
parser.add_argument("--http", action='store_true',
help="Use http:// instead of https://")
parser.add_argument("--NAempty", action='store_const', dest='na',
const='', default='NA',
help="prints the empty string when not found (otherwise %(default)r)")
kwargs = vars(parser.parse_args())
kwargs['levels'] = [x.strip() for x in kwargs['levels'].split(',')]
for clades in iter_clades(**kwargs):
print('\t'.join(clades))
if __name__ == '__main__':
main()