-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathparse_websites.py
85 lines (69 loc) · 3.28 KB
/
parse_websites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Parse some websites and print some information
"""
import os
import sys
import re
import argparse
from bs4 import BeautifulSoup
__author__ = 'Rob Edwards'
def clean_field(text):
    """
    Normalize whitespace/semicolon noise in a scraped text field.

    Newlines become '; ' separators, carriage returns are dropped, runs of
    empty ';  ;' separators left behind by blank lines are collapsed, and
    leading/trailing semicolons and whitespace are stripped.

    :param text: raw text extracted from the HTML
    :return: the cleaned, single-line text
    """
    text = text.replace('\n', '; ')
    text = text.replace('\r', '')
    # repeatedly collapse '; ;' until no more remain (each blank line in the
    # source HTML can leave several adjacent empty separators)
    (text, n) = re.subn(r';\s+;', '; ', text)
    while n > 0:
        (text, n) = re.subn(r';\s+;', '; ', text)
    text = re.sub(r';\s+', '; ', text)
    text = re.sub(r'\s+:\s+;\s+', ' : ', text)
    # strip leading/trailing separator junk
    text = re.sub(r'^[\s;]+', '', text)
    text = re.sub(r'[\s;]+$', '', text)
    return text


def main():
    """Parse each saved .aspx page in a directory and print strain records."""
    parser = argparse.ArgumentParser(description='Parse a directory of websites')
    parser.add_argument('-d', help='directory of files', required=True)
    args = parser.parse_args()

    for f in os.listdir(args.d):
        path = os.path.join(args.d, f)
        try:
            # the filename minus its .aspx extension is the record ID
            aid = f.replace('.aspx', '')
            # context manager closes the handle (the original leaked one
            # open file per input file)
            with open(path, 'r') as fh:
                soup = BeautifulSoup(fh, 'html.parser')

            desc = soup.find('meta', attrs={'name': "description"})
            descc = desc['content'] if desc else "No description"

            if "Designation" in descc:
                # raw strings avoid invalid-escape warnings on \s in py3.12+
                parts = re.search(r'^(.*?)\s+Designation:\s+(.*?)\s+TypeStrain=(\S+)\s+Application:\s*(.*?)$', descc)
                if parts:
                    (org, desig, typestrain, application) = parts.groups()
                    # drop trademark symbols from the organism name
                    org = org.replace('®', '')
                    org = org.replace('™', '')
                    print("Name\t{}\nDesignation\t{}\nType Strain\t{}\nApplication\t{}\n".format(org, desig, typestrain, application))
                else:
                    sys.stderr.write("Malformed description in {}\n".format(path))
            elif descc:
                print("Name\t{}\n".format(descc))
            else:
                sys.stderr.write("No description in {}\n".format(path))

            organism = clean_field(soup.title.get_text())
            print("Organism\t{}".format(organism))
            print("ID\t{}".format(aid))

            for table in soup.find_all('table', class_="fulllist"):
                for row in table.find_all('tr'):
                    th = row.find('th')
                    td = row.find('td')
                    # a row missing either cell used to raise AttributeError
                    # and abort the whole file; skip just the malformed row
                    if th is None or td is None:
                        continue
                    header = th.get_text()
                    cell = td.get_text()
                    if header:
                        header = header.strip()
                    if cell:
                        cell = clean_field(cell)
                        print("{}\t{}".format(header, cell))

            # record terminator, one per input file
            print("//")
        except Exception as err:
            sys.stderr.write("There was an error parsing {}\n{}\n. Skipped\n".format(f, err))


if __name__ == "__main__":
    main()