spider.py
#coding: utf-8
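# Python 2 crawler: walks the paginated listing pages of www.pap.fr,
# extracts links to rental-ad pages and stores their URLs in spider.sqlite.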
import sqlite3
import urllib
import re
import ssl
from urlparse import urljoin
from urlparse import urlparse
from bs4 import BeautifulSoup
import time
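# Placeholder for an optional SSL context (left unused below).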
scontext = None
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()
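# Results table: one row per ad; this script only fills the url column,
# the remaining columns (titre, prix, ...) are left empty here.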
cur.execute('''CREATE TABLE IF NOT EXISTS Results
(id INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE, url TEXT UNIQUE, titre TEXT,
prix TEXT, zone TEXT, cp TEXT, pieces TEXT,
surface TEXT,chambre TEXT, metro1 TEXT, metro2 TEXT, metro3
TEXT, tel TEXT, error INTEGER, cordN REAL, cordE REAL )''')
conn.commit()
# Ask for the base listing URL and how many result pages to crawl.
starturl = raw_input('Enter a url to crawl: ')
sval = raw_input('How many pages: ')
many = int(sval)
page = 1
count = 0
url = None
while many > 0:
    # Pages after the first are reached by appending '-<page number>' to the start URL.
    if page != 1:
        url = starturl + '-' + str(page)
    else:
        url = starturl
    print 'Scanning:', url
    page += 1
    many = many - 1
    try:
        enter = urllib.urlopen(url)
        html = enter.read()
        if enter.getcode() != 200:
            print "Error on page: ", enter.getcode()
        if 'text/html' != enter.info().gettype():
            print "Ignore non text/html page"
            continue
        #print '('+str(len(html))+')',
        soup = BeautifulSoup(html, "lxml")
    except KeyboardInterrupt:
        print ''
        print 'Program interrupted by user...'
        break
    except:
        print "Unable to retrieve or parse page"
        continue
    # Collect every link that looks like a rental ad page
    # (contains '/annonce/', 'location' and ends with 'r' + digits).
    tags = soup('a')
    href0 = list()  # links before deduplication
    for tag in tags:
        try:
            addr = 'http://www.pap.fr' + tag.get('href')
            addr = re.findall('^http.*/annonce/.*location.+r[0-9]+', addr)
            href0 += addr
        except:
            continue
    hrefs = list(set(href0))  # deduplicate
    hrefs.sort(key=href0.index)  # keep the original page order
    for href in hrefs:
        try:
            cur.execute('INSERT OR IGNORE INTO Results (url) VALUES ( ? )', (href,))
            count = count + 1
            print href
        except:
            continue
    conn.commit()
    if many > 1:
        print "Waiting...."
        time.sleep(5)
print count, "urls have been retrieved!!"
cur.close()