crawl.py (forked from schedutron/CPAP)

#!/usr/bin/env python2
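"""Simple single-domain web crawler.

Starting from a seed URL, each page is downloaded to a local directory
tree that mirrors the remote host and path, its anchors are parsed with
htmllib.HTMLParser, and any links that stay within the seed's domain are
queued for crawling in turn.
"""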
import cStringIO
import formatter
from htmllib import HTMLParser
import httplib
import os
import sys
import urllib
import urlparse


class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        """Create usable local filename from URL"""
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filepath = '%s%s' % (host, parsed.path)
        if not os.path.splitext(parsed.path)[1]:
            filepath = os.path.join(filepath, default)
        linkdir = os.path.dirname(filepath)
        if not os.path.isdir(linkdir):
            if os.path.exists(linkdir):
                # A plain file is squatting on the directory name; remove it.
                os.unlink(linkdir)
            os.makedirs(linkdir)
        return url, filepath

    def download(self):
        """Download URL to specific named file"""
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            # On failure, return a one-tuple whose string starts with '***';
            # Crawler.get_page() checks for this marker before parsing.
            retval = (('*** ERROR: bad URL "%s": %s' %
                       (self.url, e)),)
        return retval

    def parse_links(self):
        """Parse out the links found in downloaded HTML file"""
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist
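
# Illustration of Retriever.get_file()'s URL-to-filename mapping
# (www.example.com is just a placeholder host):
#   http://www.example.com/a/b.html  ->  www.example.com/a/b.html
#   http://www.example.com/a/        ->  www.example.com/a/index.html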


class Crawler(object):
    count = 0

    def __init__(self, url):
        self.q = [url]
        self.seen = set()
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        # Keep only the last two components of the host, e.g.
        # 'www.example.com' -> 'example.com', for the same-domain test below.
        self.dom = '.'.join(host.split('.')[-2:])

    def get_page(self, url, media=False):
        """Download page & parse links, add to queue if nec"""
        r = Retriever(url)
        fname = r.download()[0]
        if fname[0] == '*':
            print fname, '... skipping parse'
            return
        Crawler.count += 1
        print '\n', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', fname
        self.seen.add(url)
        ftype = os.path.splitext(fname)[1]
        if ftype not in ('.htm', '.html'):
            return
        for link in r.parse_links():
            if link.startswith('mailto:'):
                print '... discarded, mailto link'
                continue
            if not media:
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '... discarded, media file'
                    continue
            if not link.startswith('http://'):
                link = urlparse.urljoin(url, link)
            print '*', link,
            if link not in self.seen:
                # Crude same-domain check: substring match against the link.
                if self.dom not in link:
                    print '... discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self, media=False):
        """Process next page in queue (if any)"""
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)
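
# Note: go() pops from the end of self.q, so the crawl runs roughly
# depth-first; popping from the front (e.g. with collections.deque and
# popleft()) would make it breadth-first instead.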


def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and \
            not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
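
# Example invocation (Python 2 only: the cStringIO, htmllib, httplib, urllib,
# and urlparse modules used here were removed or reorganized in Python 3);
# www.example.com is just a placeholder:
#   $ python2 crawl.py http://www.example.com/
# Downloaded pages land under ./www.example.com/ in the current directory.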