douban_crawler.py (forked from tongzhou80/Web-crawlers)
#coding=utf-8
# Tips:
# 1. Target site: http://www.douban.com; signing in requires solving a captcha.
# 2. Parsing: BeautifulSoup, which is awesome.
# 3. Results are stored in "data.txt".
import sys
import urllib2
import urllib
import cookielib
from BeautifulSoup import BeautifulSoup
import StringIO
from PIL import Image
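# Note: this script is Python 2 only (urllib2, cookielib, StringIO, print
# statements) and uses BeautifulSoup 3 (the "from BeautifulSoup import
# BeautifulSoup" form) plus PIL to display the captcha image.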
#-------------------------------------------------------------------log in part
# A CookieJar plus an installed opener makes every subsequent urllib2.urlopen()
# call send and store cookies automatically, so the login session persists
# across all requests below.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.114 Safari/537.36'}
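# Fetch the login page, retrying until a response arrives (the 1-second
# timeout below is tight, so a request may need several attempts).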
url = 'https://www.douban.com/accounts/login'
loginPage = ''
keepRequest = 1
while keepRequest == 1:
    try:
        req = urllib2.Request(url, headers=headers)
        loginPage = urllib2.urlopen(req, timeout=1).read()
    except:
        print 'request again'
    else:
        keepRequest = 0
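# Parse the login form to pull out the captcha image URL and the hidden
# captcha-id field; both are needed for the login POST below.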
soup = BeautifulSoup(loginPage)
imgUrl = soup.find(attrs={'id': 'captcha_image'})['src']
captcha_id = soup.find(attrs={'name': 'captcha-id'})['value']
buffer = urllib2.urlopen(imgUrl).read()
im = Image.open(StringIO.StringIO(buffer))
im.show()
captcha_solution = raw_input("Captcha is:")
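# Build the login form data. 'redir' is where Douban sends the browser after a
# successful login; the final URL is checked against it below to confirm the
# login worked.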
data = {'source': 'simple',
        'redir': 'http://www.douban.com/people/77250418/contacts',
        'form_email': 'your email',        # replace with your Douban email
        'form_password': 'your password',  # replace with your Douban password
        'captcha-solution': captcha_solution,
        'captcha-id': captcha_id,
        'user_login': '登录'  # the submit button's value ("log in")
        }
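# urlencode() turns the dict into an application/x-www-form-urlencoded body;
# passing a data argument to urllib2.Request makes the request a POST.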
post_data = urllib.urlencode(data)
confirmUrl = ''
keepRequest = 1
while keepRequest == 1:
    try:
        req = urllib2.Request(url, post_data, headers)
        confirmUrl = urllib2.urlopen(req, timeout=1).geturl()
    except:
        print 'request again'
    else:
        keepRequest = 0
if confirmUrl == 'http://www.douban.com/people/77250418/contacts':
    print 'sign in success'
else:
    print 'sign in failed, please run again'
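# geturl() returns the final URL after redirects: landing on the 'redir'
# target means the login succeeded; being bounced anywhere else (e.g. back to
# the login form) means it failed.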
#-------------------------------------------------------------------mining part
show = open('data.txt', 'w')
userList = ['77250418']
currentUrl = 'http://www.douban.com/people/77250418/contacts'
userCnt = 0
searchCnt = 0
maxUserNum = 3000
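# userList is a FIFO queue: pop(0) takes the next user to visit, and newly
# found contact ids are appended, so the contact graph is crawled
# breadth-first until maxUserNum users have been processed.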
while searchCnt < maxUserNum:
    currentUrl = currentUrl[0:29]  # keep only 'http://www.douban.com/people/'
    #print currentUrl
    currentUser = userList.pop(0)
    currentUrl += currentUser + '/contacts'
    keepRequest = 1
    userPage = ''
    while keepRequest == 1:
        try:
            req = urllib2.Request(currentUrl, headers=headers)
            userPage = urllib2.urlopen(req, timeout=1).read()
        except:
            print 'request again'
        else:
            keepRequest = 0
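    # The opener installed at the top still holds the login cookie, so this
    # contacts page is fetched as the signed-in user.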
    #print userPage
    soup = BeautifulSoup(userPage)
    contactList = soup.findAll('dt')
    contactId = ''
    #print len(contactList)
    for i in range(0, len(contactList)):
        currentContact = contactList[i]
        contactLink = currentContact.contents[0].contents[0]['src']
        contactLink = contactLink[29:]
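        # The avatar URL embeds the owner's numeric id in its file name, so
        # the digits before the first '-' are collected as the contact's id.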
        #print contactLink
        for letter in contactLink:
            if letter == '-':
                break
            if letter >= '0' and letter <= '9':
                contactId += letter
        #print contactId
        if contactId == '':
            continue
        else:
            show.write(currentUser + ' ' + contactId + '\n')
            if userCnt < maxUserNum:
                userList.append(contactId)
                userCnt += 1
            contactId = ''
    show.flush()
    searchCnt += 1
print 'In all,', searchCnt, "users' contacts have been scraped"
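
# For reference only: a minimal sketch of the same cookie-persisting session in
# Python 3 with the third-party requests library (not used by this script; the
# form fields and URLs are carried over from the urllib2 code above as-is):
#
#   import requests
#   session = requests.Session()               # keeps cookies across requests
#   session.headers.update(headers)
#   login_page = session.get(url, timeout=1).text
#   session.post(url, data=data, timeout=1)    # logged-in session from here on
#   contacts_page = session.get(currentUrl, timeout=1).text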