forked from schedutron/CPAP
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Complete Example 9-4: Link Parser; some parts not working as they're deprecated
- Loading branch information
1 parent
809fba9
commit 46bd3e1
Showing
1 changed file
with
72 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#!/usr/bin/env python | ||
|
||
from HTMLParser import HTMLParser | ||
from cStringIO import StringIO | ||
from urllib2 import urlopen | ||
from urlparse import urljoin | ||
|
||
from bs4 import BeautifulSoup, SoupStrainer | ||
from html5lib import parse, treebuilders | ||
|
||
# Pages that main() downloads and runs through each link parser.
URLs = (
    'http://python.org',
    'http://google.com'
)
|
||
def output(x):
    """Print the distinct items of *x* in sorted order, one per line.

    x -- any iterable of strings (the callers pass a generator of URLs).
         Duplicates are collapsed with a set before sorting.
    """
    unique_links = sorted(set(x))
    # A single parenthesised argument prints the same text on Python 2
    # (print statement around a parenthesised expression) and Python 3
    # (print function), so this line is valid on both.
    print('\n'.join(unique_links))
|
||
def simpleBS(url, f):
    """simpleBS() - use BeautifulSoup to parse all tags to get anchors

    url -- base URL that every href is resolved against with urljoin().
    f   -- the HTML to parse, passed straight to BeautifulSoup
           (process() supplies a file-like object).
    """
    # href=True restricts the match to <a> tags that actually carry an
    # href attribute; indexing x['href'] on a bare <a name="..."> anchor
    # would raise KeyError and abort the whole run.
    anchors = BeautifulSoup(f).find_all('a', href=True)
    output(urljoin(url, x['href']) for x in anchors)
|
||
def fasterBS(url, f):
    """fasterBS() - use BeautifulSoup to parse only the anchor tags

    url -- base URL that every href is resolved against with urljoin().
    f   -- the HTML to parse, passed straight to BeautifulSoup
           (process() supplies a file-like object).
    """
    # SoupStrainer('a') asks the parser to build a tree out of the <a>
    # tags only.
    soup = BeautifulSoup(f, 'lxml', parse_only=SoupStrainer('a'))
    # Search the strained tree instead of iterating the soup object
    # directly: direct iteration yields every top-level child, which may
    # include non-tag nodes (such as the doctype) that cannot be indexed
    # by attribute name.  href=True also skips anchors without an href,
    # which would otherwise raise KeyError.
    anchors = soup.find_all('a', href=True)
    output(urljoin(url, x['href']) for x in anchors)
|
||
def htmlparser(url, f):
    """htmlparser() - use HTMLParser to parse anchor tags

    url -- base URL that every href is resolved against with urljoin().
    f   -- file-like object; its entire contents are read and fed to
           the parser in one call.
    """
    class AnchorParser(HTMLParser):
        """Collect the href value of every <a> start tag into self.data."""

        def __init__(self):
            # Call the base initialiser explicitly rather than via
            # super() so this also works where HTMLParser is an
            # old-style class (Python 2).
            HTMLParser.__init__(self)
            # Created up front so that a page with no anchors yields an
            # empty list instead of an AttributeError on parser.data.
            self.data = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            # attrs is a sequence of (name, value) pairs.
            for name, value in attrs:
                if name == 'href':
                    self.data.append(value)

    parser = AnchorParser()
    parser.feed(f.read())
    output(urljoin(url, x) for x in parser.data)
|
||
def html5libparse(url, f):
    """html5libparse() - use html5lib to parse anchor tags

    url -- base URL that every href is resolved against with urljoin().
    f   -- the HTML to parse, passed straight to html5lib's parse()
           (process() supplies a file-like object).
    """
    # namespaceHTMLElements=False keeps element tags plain ('a') rather
    # than XHTML-namespace-qualified, so they can be matched by name.
    document = parse(f, namespaceHTMLElements=False)
    # The default tree is ElementTree-style: iter('a') walks every
    # descendant (iterating the root directly only yields its immediate
    # children), and attributes are read with .get().  Anchors without
    # an href are skipped.
    hrefs = (x.get('href') for x in document.iter('a'))
    output(urljoin(url, href) for href in hrefs if href is not None)
|
||
def process(url, data):
    """Run each link parser over one downloaded page, printing a header
    before each.

    url  -- address the page was fetched from; handed to every parser.
    data -- seekable file-like object holding the page; it is rewound
            between stages so each one reads from the start.
    """
    # (header, parser) pairs in display order.  A parser of None means
    # the stage only prints its header:
    #   fasterBS      - perhaps there's a bug in bs4, not working as of now
    #   html5libparse - not working; some parts deprecated.
    stages = (
        ('\n*** simple BS', simpleBS),
        ('\n*** faster BS', None),
        ('\n*** HTMLParser', htmlparser),
        ('\n*** HTML5lib', None),
    )
    for index, (header, parser) in enumerate(stages):
        # Rewind before every stage except the first.
        if index:
            data.seek(0)
        print(header)
        if parser is not None:
            parser(url, data)
|
||
def main():
    """Download every page listed in URLs and run process() on it."""
    for url in URLs:
        f = urlopen(url)
        try:
            # Buffer the whole body in memory so process() can rewind it
            # between parsers.
            data = StringIO(f.read())
        finally:
            # Release the connection even if reading the body fails.
            f.close()
        process(url, data)
|
||
# Run the link-parser demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()