forked from schedutron/CPAP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_links.py
72 lines (61 loc) · 1.99 KB
/
parse_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
from HTMLParser import HTMLParser
from cStringIO import StringIO
from urllib2 import urlopen
from urlparse import urljoin
from bs4 import BeautifulSoup, SoupStrainer
from html5lib import parse, treebuilders
# Pages that main() downloads and scans for anchor links.
URLs = ('http://python.org', 'http://google.com')
def output(x):
    """Print the distinct items of ``x`` in sorted order, one per line."""
    unique = set(x)
    ordered = sorted(unique)
    # Single-argument print(...) behaves the same as the print statement.
    print('\n'.join(ordered))
def simpleBS(url, f):
    """simpleBS() - use BeautifulSoup to parse all tags to get anchors.

    Parses the complete document, collects every <a> tag that has an
    href attribute, resolves each link against ``url`` and passes the
    resulting URLs to output().

    url -- base URL used to resolve relative links
    f   -- the HTML to parse (process() passes a file-like object)
    """
    # href=True skips anchors that carry no href (e.g. <a name="top">);
    # indexing those with x['href'] would raise KeyError.
    anchors = BeautifulSoup(f).findAll('a', href=True)
    output(urljoin(url, x['href']) for x in anchors)
def fasterBS(url, f):
    """fasterBS() - use BeautifulSoup to parse only the anchor tags.

    A SoupStrainer restricts parsing to <a> tags; every anchor that has
    an href attribute is resolved against ``url`` and passed to output().

    url -- base URL used to resolve relative links
    f   -- the HTML to parse (process() passes a file-like object)
    """
    soup = BeautifulSoup(f, 'lxml', parse_only=SoupStrainer('a'))
    # Search for the tags instead of iterating the soup object itself:
    # iteration walks only its top-level children, which may include
    # non-tag nodes (such as a doctype) that cannot be indexed by 'href'.
    # href=True also skips anchors without an href attribute.
    anchors = soup.findAll('a', href=True)
    output(urljoin(url, x['href']) for x in anchors)
def htmlparser(url, f):
    """htmlparser() - use HTMLParser to parse anchor tags.

    Feeds the contents of ``f`` to an HTMLParser subclass that records
    the href value of every <a> start tag, then resolves each value
    against ``url`` and passes the results to output().

    url -- base URL used to resolve relative links
    f   -- file-like object whose read() returns the HTML
    """
    class AnchorParser(HTMLParser):
        """Collect the href attribute values of <a> start tags."""

        def __init__(self):
            # Call the base initialiser directly rather than via super():
            # this form does not depend on HTMLParser being a new-style
            # class.
            HTMLParser.__init__(self)
            # Created up front so that a document without any anchors
            # yields an empty list instead of a missing attribute.
            self.data = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            for name, value in attrs:
                if name == 'href':
                    self.data.append(value)

    parser = AnchorParser()
    parser.feed(f.read())
    output(urljoin(url, x) for x in parser.data)
def html5libparse(url, f):
    """html5libparse() - use html5lib to parse anchor tags.

    Parses ``f`` with html5lib's default (etree) tree builder, finds
    every <a> element that has an href attribute, resolves each link
    against ``url`` and passes the results to output().

    url -- base URL used to resolve relative links
    f   -- the HTML to parse (process() passes a file-like object)
    """
    # namespaceHTMLElements=False keeps element tags as plain names
    # ('a') rather than namespace-qualified ones, so iter('a') matches.
    root = parse(f, namespaceHTMLElements=False)
    # iter() walks the whole tree; iterating the root directly would only
    # visit its immediate children. Anchors without an href are skipped.
    output(urljoin(url, anchor.get('href'))
           for anchor in root.iter('a')
           if anchor.get('href') is not None)
def process(url, data):
    """Run the enabled link parsers over ``data``, one banner per parser.

    A banner line is printed for all four parsers, but only the enabled
    ones are called. ``data`` is rewound with seek(0) between stages so
    each parser reads the document from the beginning.

    url  -- base URL handed to each parser
    data -- seekable file-like object holding the page contents
    """
    stages = (
        ('\n*** simple BS', simpleBS),
        # fasterBS(url, data) stays disabled - noted as not working
        # (perhaps a bug in bs4).
        ('\n*** faster BS', None),
        ('\n*** HTMLParser', htmlparser),
        # html5libparse(url, data) stays disabled - noted as not working;
        # some parts deprecated.
        ('\n*** HTML5lib', None),
    )
    for position, (banner, parser) in enumerate(stages):
        if position:
            # Rewind before every stage except the first.
            data.seek(0)
        print(banner)
        if parser is not None:
            parser(url, data)
def main():
    """Download every page listed in URLs and run process() on it.

    Each response body is read fully into an in-memory StringIO so the
    parsers can rewind and re-read it without another download.
    """
    for url in URLs:
        f = urlopen(url)
        try:
            data = StringIO(f.read())
        finally:
            # Release the connection even when read() fails part-way.
            f.close()
        process(url, data)
# Run the link extraction only when executed as a script, not on import.
if __name__ == '__main__':
    main()