-
Notifications
You must be signed in to change notification settings - Fork 4
/
crawlLib.py
166 lines (129 loc) · 6.19 KB
/
crawlLib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import pinterest
import sys,fcntl,csv,urllib2
import time,random
import os.path
import gzip
from datetime import datetime
import re
from datetime import *
class Crawler:
    """Crawl one Pinterest profile and mirror what it finds under ``profiles/``.

    For a profile id ``X`` the crawler writes:

    * ``profiles/X/profile``                 - gzip of the profile page HTML
    * ``profiles/X/attributes``              - board/pin/follower counters
    * ``profiles/X/boards/<board>/attributes`` - board metadata
    * ``profiles/X/boards/<board>/firstPage``  - gzip of the first board page
    * ``profiles/X/boards/<board>/<pin>``      - gzip of each recent pin page
    * ``profiles/X/boards/<board>/timeline``   - one ``pin;created;crawled``
      line per saved pin (opened in append mode)

    All network access goes through ``self.pinterest`` (``fetch``,
    ``fetchPins`` and ``fetchSimple``), which is defined outside this class.

    ``print(...)`` is always called with a single argument so the output is
    the same under the Python 2 print statement this module relies on.
    """

    # Page size used when asking a board for more pins.
    PINS_PER_PAGE = 25

    # Path segments that look like boards in profile links but are not boards.
    _RESERVED_NAMES = frozenset(["likes", "followers", "following", "boards", "pins"])

    #------ initialization -------#
    def __init__(self, verbose=0):
        """Create the crawler and its Pinterest client.

        ``verbose`` is accepted but not used by this class.
        """
        self.pinterest = pinterest.Pinterest()
        self.host = "localhost"

    def findTime(self, timeAgo):
        """Turn a relative "time ago" label into an absolute datetime.

        Returns:
            ``datetime.now()`` when the label contains ``"now"``;
            now minus N seconds/minutes/hours when the label contains a
            whitespace-separated number N and one of ``"sec"``, ``"minut"``
            or ``"hour"``;
            ``-1`` for anything else (other units, or no number at all).
        """
        now = datetime.now()
        if timeAgo.find("now") != -1:
            return now

        numbers = [int(s) for s in timeAgo.split() if s.isdigit()]
        if not numbers:
            # Labels without a number (e.g. "yesterday") used to raise
            # IndexError; report them as "not recent" like unknown units.
            return -1
        tempo = numbers[0]

        if timeAgo.find("sec") != -1:
            return now - timedelta(seconds=tempo)
        if timeAgo.find("minut") != -1:
            return now - timedelta(minutes=tempo)
        if timeAgo.find("hour") != -1:
            return now - timedelta(hours=tempo)
        return -1

    def validBoardName(self, name):
        """Return 0 if ``name`` is a reserved profile section, 1 otherwise."""
        if name in self._RESERVED_NAMES:
            return 0
        return 1

    def gatherInfo(self, pinterestID):
        """Crawl the profile ``pinterestID`` and every board linked on its page.

        Returns:
            ``None`` when the profile fetch returns ``1`` (the error value
            this code checks for), ``0`` after a completed crawl.

        Raises:
            AttributeError: when an expected ``<meta>`` tag or the pin
                time-ago span is missing from a fetched page.
        """
        print("iniciei parse de " + pinterestID)
        path = "profiles/" + pinterestID
        if not os.path.exists(path):
            os.makedirs(path)

        # The profile file is created before fetching, as it always was, but
        # is now closed on every path, including the early error return.
        profile = gzip.open(path + "/profile", "w")
        try:
            html = self.pinterest.fetch("http://www.pinterest.com/" + pinterestID)
            print("voltou a resposta")
            if html == 1:
                print("voltando pq deu erro - usuario nao existe")
                return
            profile.write(html)
        finally:
            profile.close()

        print("Coletando attributes")
        nPins = self._metaContent("pinterestapp:pins", html)
        nBoards = self._metaContent("pinterestapp:boards", html)
        nFollowing = self._metaContent("pinterestapp:following", html)
        nFollower = self._metaContent("pinterestapp:followers", html)

        # write the profile attributes
        atributos = open(path + "/attributes", "w")
        try:
            header = "nBoards;nPins;nFollower;nFollowing\n"
            atributos.write(header + ";".join([nBoards, nPins, nFollower, nFollowing]))
        finally:
            atributos.close()

        print("Coletando Boards")
        # Save the boards. The first profile page only lists 49 boards, not 50.
        # NOTE: only boards linked from this first page are crawled; paging
        # through further boards was sketched here once but never implemented.
        pathBoards = "profiles/" + pinterestID + "/boards"
        if not os.path.exists(pathBoards):
            os.makedirs(pathBoards)
        boards = re.findall('<a href="(.*)" class="boardLinkWrapper">', html)
        for albumLink in boards:
            self._crawlBoard(pathBoards, albumLink)
        return 0

    def _metaContent(self, name, html):
        """Return the stripped ``content`` of the meta tag called ``name``.

        Raises AttributeError when the tag is not present in ``html``.
        """
        return re.search('name="' + name + '" content="(.*)" ', html).group(1).strip()

    def _saveGzip(self, filePath, data):
        """Write ``data`` to ``filePath`` as gzip, closing the file even on error."""
        stream = gzip.open(filePath, "w")
        try:
            stream.write(data)
        finally:
            stream.close()

    def _crawlBoard(self, pathBoards, albumLink):
        """Save the metadata, first page and recent pins of a single board."""
        print(albumLink)
        albumName = albumLink.split("/")[2]
        boardDir = pathBoards + "/" + albumName
        # The directory is created even for names that turn out to be invalid.
        if not os.path.exists(boardDir):
            os.makedirs(boardDir)
        boardUrl = "http://www.pinterest.com" + albumLink
        print(boardUrl)

        # check if it is a valid board name
        if not self.validBoardName(albumName):
            return

        # crawl the first pin page of the board (25 items at most)
        htmlBoard = self.pinterest.fetchPins(boardUrl, "0")

        # shared (collaborative) boards are skipped
        if re.search('class="inline BoardCollaborators Module"', htmlBoard):
            return

        nPinsOnBoard = self._metaContent("pinterestapp:pins", htmlBoard)
        title = self._metaContent("og:title", htmlBoard)
        nFollowersBoard = self._metaContent("followers", htmlBoard)
        category = self._metaContent("pinterestapp:category", htmlBoard)

        info = open(boardDir + "/timeline", "a")
        try:
            print(nPinsOnBoard)

            # write metainfo of the board
            saida = open(boardDir + "/attributes", "w")
            try:
                header = "title;category;nPins;nFollower;boardLink\n"
                # ';' is the field separator, so it is removed from the title.
                att = ";".join([title.replace(";", ""), category, nPinsOnBoard,
                                nFollowersBoard, albumLink])
                saida.write(header + att)
            finally:
                saida.close()

            # write the first page of the board
            self._saveGzip(boardDir + "/firstPage", htmlBoard)

            self._crawlPins(albumLink, boardDir, htmlBoard, nPinsOnBoard, info)
        finally:
            # Close the timeline even when a fetch or parse step raises.
            info.close()

    def _crawlPins(self, albumLink, boardDir, htmlBoard, nPinsOnBoard, info):
        """Save pins page by page until one is older than findTime accepts."""
        boardUrl = "http://www.pinterest.com" + albumLink
        pinsRead = set()
        lastRequest = "0"  # argument of the fetchPins call that produced htmlBoard

        # crawl until some content not generated recently is found
        while True:
            # the HTML changed again: the class attribute may carry a suffix,
            # so only its prefix is matched
            pins = re.findall('<a href="(.*)" class="pinImageWrapper', htmlBoard)
            for pin in pins:
                if pin in pinsRead:
                    continue
                htmlPin = self.pinterest.fetchSimple("http://www.pinterest.com" + pin)
                # NOTE(review): a pin page without this span raises
                # AttributeError here - confirm whether skipping is preferable.
                timeAgo = re.search('class="commentDescriptionTimeAgo">(.*)</span>', htmlPin)
                timeCreate = self.findTime(timeAgo.group(1))
                if timeCreate == -1:
                    # Not created within seconds/minutes/hours: stop this board.
                    return
                pinsRead.add(pin)
                print(len(pinsRead))

                # save the pin-html
                pinId = pin.split("/")[2]
                self._saveGzip(boardDir + "/" + pinId, htmlPin)
                # add the meta info telling when the content was created
                info.write(pinId + ";" + str(timeCreate) + ";" + str(datetime.now()) + "\n")

            total = int(nPinsOnBoard)
            seen = len(pinsRead)
            # Stop when everything was read, or when the last page was not
            # full (the count is not a multiple of the page size).
            if total <= seen or seen % self.PINS_PER_PAGE != 0:
                return

            nRequest = seen + min(self.PINS_PER_PAGE, total - seen)
            request = str(nRequest)
            if request == lastRequest:
                # The previous page added no new pins; asking for the same
                # range again would loop forever.
                return
            print("Pedindo mais " + str(nRequest))
            htmlBoard = self.pinterest.fetchPins(boardUrl, request)
            lastRequest = request