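# scrapelib.py (forked from mikejs/scrapelib)
#
# Scraping helpers: the Scraper class below wraps httplib2 (when available)
# or urllib2, adding optional HTTP caching, request throttling, robots.txt
# checking, cookie handling and on-disk error logging.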
import os
import sys
import time
import logging
import tempfile
import urllib2
import urlparse
import datetime
import cookielib
import robotparser
try:
    import json
except ImportError:
    import simplejson as json

try:
    import httplib2
    USE_HTTPLIB2 = True
except ImportError:
    USE_HTTPLIB2 = False

__version__ = '0.3.0-dev'
_user_agent = 'scrapelib %s' % __version__

class NullHandler(logging.Handler):
    def emit(self, record):
        pass

_log = logging.getLogger('scrapelib')
_log.addHandler(NullHandler())

class ScrapeError(Exception):
    pass


class RobotExclusionError(ScrapeError):
    """
    Raised when an attempt is made to access a page denied by
    the host's robots.txt file.
    """
    def __init__(self, message, url, user_agent):
        super(RobotExclusionError, self).__init__(message)
        self.url = url
        self.user_agent = user_agent


class HTTPMethodUnavailableError(ScrapeError):
    """
    Raised when the supplied HTTP method is invalid or not supported
    by the HTTP backend.
    """
    def __init__(self, message, method):
        super(HTTPMethodUnavailableError, self).__init__(message)
        self.method = method


class HTTPError(ScrapeError):
    """
    Raised when urlopen encounters a 4xx or 5xx error code and the
    raise_errors option is true.
    """
    def __init__(self, response, body):
        message = '%s while retrieving %s' % (response.code, response.url)
        super(HTTPError, self).__init__(message)
        self.response = response
        self.body = body

class ErrorManager(object):
    # Mixed into the result string classes below so a scrape can be wrapped
    # in a ``with`` block: if the block raises and save_errors is set, the
    # offending page is written to error_dir.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type and self._scraper.save_errors:
            self._scraper._save_error(self.response.url, self)
        return False

class ResultStr(str, ErrorManager):
    def __new__(cls, scraper, response, string):
        self = str.__new__(cls, string)
        self._scraper = scraper
        self.response = response
        return self


class ResultUnicode(unicode, ErrorManager):
    def __new__(cls, scraper, response, string):
        self = unicode.__new__(cls, string)
        self._scraper = scraper
        self.response = response
        return self
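
# Illustrative usage sketch (not part of the module): because the result
# strings mix in ErrorManager, a scrape can be wrapped in a ``with`` block so
# that a failing parse is recorded when error_dir is configured.  ``parse``
# here stands for a hypothetical caller-supplied function:
#
#     with scraper.urlopen(url) as page:
#         parse(page)   # if this raises, the page body lands in error_dir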

class Headers(dict):
    def __init__(self, d={}):
        super(Headers, self).__init__()
        for k, v in d.items():
            self[k] = v

    def __getitem__(self, key):
        return super(Headers, self).__getitem__(key.lower())

    def __setitem__(self, key, value):
        super(Headers, self).__setitem__(key.lower(), value)

    def __delitem__(self, key):
        return super(Headers, self).__delitem__(key.lower())

    def __contains__(self, key):
        return super(Headers, self).__contains__(key.lower())

    def __eq__(self, other):
        for k, v in other.items():
            # a missing key should compare unequal rather than raise KeyError
            if k not in self or self[k] != v:
                return False
        return True

    def getallmatchingheaders(self, name):
        header = self.get(name)
        if header:
            return [name + ": " + header]
        return []

    def getheaders(self, name):
        header = self.get(name.lower())
        if header:
            return [header]
        return []
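
# Illustrative sketch (not part of the module): Headers lowercases keys on
# every access, so header lookups are case-insensitive:
#
#     h = Headers({'Content-Type': 'text/html'})
#     h['content-type']    # -> 'text/html'
#     'CONTENT-TYPE' in h  # -> True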

class Response(object):
    def __init__(self, url, requested_url, protocol='http', code=None,
                 fromcache=False, headers={}):
        """
        :param url: the actual URL of the response (after following any
          redirects)
        :param requested_url: the original URL requested
        :param code: response code (if HTTP)
        :param fromcache: response was retrieved from local cache
        """
        self.url = url
        self.requested_url = requested_url
        self.protocol = protocol
        self.code = code
        self.fromcache = fromcache
        self.headers = Headers(headers)

    def info(self):
        return self.headers

class Scraper(object):
    def __init__(self, user_agent=_user_agent,
                 cache_dir=None, headers={},
                 requests_per_minute=60,
                 follow_robots=True,
                 error_dir=None,
                 accept_cookies=True,
                 disable_compression=False,
                 use_cache_first=False,
                 raise_errors=True,
                 follow_redirects=True,
                 **kwargs):
        """
        :param user_agent: the value to send as a User-Agent header on
          HTTP requests
        :param cache_dir: if not None, http caching will be enabled with
          cached pages stored under the supplied path
        :param requests_per_minute: maximum requests per minute (0 for
          unlimited)
        :param follow_robots: respect robots.txt files
        :param error_dir: if not None, store scraped documents that raise
          errors under the supplied path
        :param accept_cookies: HTTP cookie support
        :param disable_compression: do not accept compressed content
        :param use_cache_first: always make an attempt to use cached data first
        :param raise_errors: raise an HTTPError on 4xx or 5xx response
        :param follow_redirects: follow 3xx redirects (only honored when
          httplib2 is available)
        """
        self.user_agent = user_agent
        self.headers = headers
        self.follow_robots = follow_robots
        self._robot_parsers = {}
        self.requests_per_minute = requests_per_minute

        if cache_dir and not USE_HTTPLIB2:
            _log.warning("httplib2 not available, HTTP caching "
                         "and compression will be disabled.")

        self.error_dir = error_dir
        if self.error_dir:
            try:
                os.makedirs(error_dir)
            except OSError, e:
                if e.errno != 17:  # 17 == EEXIST: directory already exists
                    raise
            self.save_errors = True
        else:
            self.save_errors = False

        self.accept_cookies = accept_cookies
        self._cookie_jar = cookielib.CookieJar()

        self.disable_compression = disable_compression
        self.use_cache_first = use_cache_first
        self.raise_errors = raise_errors

        if USE_HTTPLIB2:
            self._http = httplib2.Http(cache_dir)
        else:
            self._http = None

        self.follow_redirects = follow_redirects

    def _throttle(self):
        now = time.time()
        diff = self._request_frequency - (now - self._last_request)
        if diff > 0:
            _log.debug("sleeping for %fs" % diff)
            time.sleep(diff)
            self._last_request = time.time()
        else:
            self._last_request = now
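
    # Worked example: with requests_per_minute=30 the requests_per_minute
    # setter below stores _request_frequency = 60.0 / 30 = 2.0, so _throttle
    # sleeps just long enough to keep at least two seconds between requests.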

    def _robot_allowed(self, user_agent, parsed_url):
        _log.info("checking robots permission for %s" % parsed_url.geturl())
        robots_url = urlparse.urljoin(parsed_url.scheme + "://" +
                                      parsed_url.netloc, "robots.txt")

        try:
            parser = self._robot_parsers[robots_url]
            _log.info("using cached copy of %s" % robots_url)
        except KeyError:
            _log.info("grabbing %s" % robots_url)
            parser = robotparser.RobotFileParser()
            parser.set_url(robots_url)
            parser.read()
            self._robot_parsers[robots_url] = parser

        return parser.can_fetch(user_agent, parsed_url.geturl())

    def _make_headers(self, url):
        if callable(self.headers):
            headers = self.headers(url)
        else:
            headers = self.headers

        if self.accept_cookies:
            # CookieJar expects a urllib2.Request-like object
            req = urllib2.Request(url, headers=headers)
            self._cookie_jar.add_cookie_header(req)
            headers = req.headers
            headers.update(req.unredirected_hdrs)

        headers = Headers(headers)

        if 'User-Agent' not in headers:
            headers['User-Agent'] = self.user_agent

        if self.disable_compression and 'Accept-Encoding' not in headers:
            headers['Accept-Encoding'] = 'text/*'

        return headers

    def _wrap_result(self, response, body):
        if self.raise_errors and response.code >= 400:
            raise HTTPError(response, body)

        if isinstance(body, unicode):
            return ResultUnicode(self, response, body)

        if isinstance(body, str):
            return ResultStr(self, response, body)

        raise ValueError('expected body string')

    @property
    def follow_redirects(self):
        if self._http:
            return self._http.follow_redirects
        return False

    @follow_redirects.setter
    def follow_redirects(self, value):
        if self._http:
            self._http.follow_redirects = value

    @property
    def requests_per_minute(self):
        return self._requests_per_minute

    @requests_per_minute.setter
    def requests_per_minute(self, value):
        if value > 0:
            self._throttled = True
            self._requests_per_minute = value
            self._request_frequency = 60.0 / value
            self._last_request = 0
        else:
            self._throttled = False
            self._requests_per_minute = 0
            self._request_frequency = 0.0
            self._last_request = 0

    def urlopen(self, url, method='GET', body=None):
        if self._throttled:
            self._throttle()

        method = method.upper()
        if method == 'POST' and body is None:
            body = ''

        # Default to HTTP requests
        if "://" not in url:
            _log.warning("no URL scheme provided, assuming HTTP")
            url = "http://" + url

        parsed_url = urlparse.urlparse(url)

        headers = self._make_headers(url)
        user_agent = headers['User-Agent']

        if parsed_url.scheme in ['http', 'https']:
            if self.follow_robots and not self._robot_allowed(user_agent,
                                                              parsed_url):
                raise RobotExclusionError(
                    "User-Agent '%s' not allowed at '%s'" % (
                        user_agent, url), url, user_agent)

            if USE_HTTPLIB2:
                _log.info("getting %s using HTTPLIB2" % url)

                if method == 'POST' and 'Content-Type' not in headers:
                    headers['Content-Type'] = ('application/'
                                               'x-www-form-urlencoded')

                # tell httplib2 not to make a request
                if self.use_cache_first and 'Cache-Control' not in headers:
                    headers['cache-control'] = 'only-if-cached'

                resp, content = self._http.request(url, method,
                                                   body=body,
                                                   headers=headers)

                # do another request if there's no copy in local cache
                if self.use_cache_first and resp.status == 504:
                    headers.pop('cache-control')
                    resp, content = self._http.request(url, method,
                                                       body=body,
                                                       headers=headers)

                our_resp = Response(resp.get('content-location') or url,
                                    url,
                                    code=resp.status,
                                    fromcache=resp.fromcache,
                                    protocol=parsed_url.scheme,
                                    headers=resp)

                # important to accept cookies before redirect handling
                if self.accept_cookies:
                    fake_req = urllib2.Request(url, headers=headers)
                    self._cookie_jar.extract_cookies(our_resp, fake_req)

                # needed because httplib2 follows the HTTP spec a bit *too*
                # closely and won't issue a GET following a POST (incorrect
                # but expected and often seen behavior)
                if (resp.status in (301, 302, 303, 307) and
                        self.follow_redirects):
                    if resp['location'].startswith('http'):
                        redirect = resp['location']
                    else:
                        redirect = urlparse.urljoin(parsed_url.scheme +
                                                    "://" +
                                                    parsed_url.netloc +
                                                    parsed_url.path,
                                                    resp['location'])
                    _log.debug('redirecting to %s' % redirect)

                    resp = self.urlopen(redirect)
                    resp.response.requested_url = url
                    return resp

                return self._wrap_result(our_resp, content)
        else:
            # not an HTTP(S) request
            if method != 'GET':
                raise HTTPMethodUnavailableError(
                    "non-HTTP(S) requests do not support method '%s'" %
                    method, method)

        # fall through to urllib2 (no httplib2, or a non-HTTP(S) scheme)
        if method not in ['GET', 'POST']:
            raise HTTPMethodUnavailableError(
                "urllib2 does not support '%s' method" % method, method)

        _log.info("getting %s using urllib2" % url)

        req = urllib2.Request(url, data=body, headers=headers)
        if self.accept_cookies:
            self._cookie_jar.add_cookie_header(req)
        resp = urllib2.urlopen(req)
        if self.accept_cookies:
            self._cookie_jar.extract_cookies(resp, req)

        our_resp = Response(resp.geturl(), url, code=resp.code,
                            fromcache=False, protocol=parsed_url.scheme,
                            headers=resp.headers)
        return self._wrap_result(our_resp, resp.read())

    def urlretrieve(self, url, filename=None, method='GET', body=None):
        result = self.urlopen(url, method, body)

        if not filename:
            _, filename = tempfile.mkstemp()

        # write in binary mode so non-text bodies survive intact
        with open(filename, 'wb') as f:
            f.write(result)

        return filename, result.response

    def _save_error(self, url, body):
        exception = sys.exc_info()[1]

        out = {'exception': repr(exception),
               'url': url,
               'body': body,
               'when': str(datetime.datetime.now())}

        base_path = os.path.join(self.error_dir, url.replace('/', ','))
        path = base_path

        n = 0
        while os.path.exists(path):
            n += 1
            path = base_path + "-%d" % n

        with open(path, 'w') as fp:
            json.dump(out, fp, ensure_ascii=False)
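

# A minimal usage sketch (illustrative, not part of the library proper).  The
# URL is a placeholder and the option values are arbitrary examples.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    scraper = Scraper(requests_per_minute=10,
                      follow_robots=True,
                      raise_errors=False)

    # urlopen returns a string subclass carrying a .response attribute
    page = scraper.urlopen("http://example.com/")
    print page.response.code, len(page)

    # urlretrieve writes the body to a file (a temp file if none is given)
    # and returns the filename along with the Response object
    filename, response = scraper.urlretrieve("http://example.com/")
    print filename, response.code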