aedCrawler.py
import sys
import re
import json
import codecs
import urllib.parse
import urllib.request
from googleSheets import getAccessToken, sheetValues, moveLinkToComplete
from time import time, sleep
from multiprocessing import Process, Queue, freeze_support
from lxml import html
from lxml import _elementpath  # imported explicitly so the freezer bundles it
from PyQt5 import QtCore, QtGui
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngine import *
from PyQt5.QtWebEngineWidgets import *

# Build the frozen executable with: python setup.py build
# Use chcp 65001 in the command prompt so Windows can print UTF-8.

# This regular expression matches phone numbers in the following formats:
# (123) 456-7890, (123) 456.7890, 123-456-7890, and 123.456.7890
phoneReg = re.compile(r'((?:\()?\d{3}(?:-|\.| |\) )\d{3}(?:-|\.)\d{4})')  # the period is a wildcard! Escape dat junt
emailReg = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}')
linkReg = re.compile(r'(?P<url>https?://[^\'"<>#%{}|\\^~\[\]`\s]+)')  # negated class: consume until a character a URL cannot contain
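
# Illustrative only (not executed): against the made-up sample
#   s = 'Call (123) 456-7890 or write info@example.com, see https://example.com/contact'
# the patterns would yield
#   phoneReg.findall(s) -> ['(123) 456-7890']
#   emailReg.findall(s) -> ['info@example.com']
#   linkReg.findall(s)  -> ['https://example.com/contact']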
forbiddenDomains = ["facebook.com", "instagram.com", "pinterest.com", "linkedin.com", "twitter.com", "google.com", 'itunes.apple.com', "youtube.com", "hammernutrition.com", 'wholefoodsmarket.com', 'myvega.com', 'lornajane.com']
forbiddenEmails = ['@illuminatistudios.com']  # not referenced yet; kept for future email filtering
forbiddenExtensions = ['.png', '.jpg', '.gif', '.svg', 'aspx']
accessToken = {'token': '',
               'expires': 0}
aeMarket = '1M5CLqzJO4QQgrOiGZKCINcYB1WE39ByoZ2Rr-8WGIXw'
timeToAbort = 30
waitForLoad = 0
app = QApplication(sys.argv)
# Stub to silence stdout/stderr when the script is frozen as a windowed exe:
#class fakeOut(object):
#    def __init__(self, *args, **kwargs):
#        pass
#    def flush(self):
#        pass
#    def write(self, dummy):
#        pass
#sys.stdout = fakeOut()
#sys.stderr = fakeOut()

class LinkScrubber(object):
    def __init__(self, *args, **kwargs):
        pass

    def run(self, url, depth, q):
        # Render the page in a child process and watchdog it from here,
        # draining the queue as results arrive.
        p = Process(target=self.render, args=(url, depth, q))
        processStart = time()
        p.start()
        self.data = []
        while p.is_alive():
            timeElapsed = time() - processStart
            if timeElapsed > timeToAbort:
                print('Process has timed out. Aborting')
                p.terminate()
                app.quit()  # maybe this isn't killing the same instance of QApplication as the thread; remember spawning a new process pretty much just starts the program again
            else:
                if not q.empty():
                    self.data.append(q.get())
        p.join()
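
    # Sketch of how run() is consumed (hypothetical values): after
    #   l = LinkScrubber(); l.run('https://example.com', 0, Queue())
    # l.data ends up holding [kind, payload] pairs drained from the child, e.g.
    #   ['link', {'url': 'https://example.com/about', 'title': 'About'}]
    #   ['email', 'info@example.com']
    #   ['number', '(123) 456-7890']
    # LinkCrawler.sortData() dispatches on the first element of each pair.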
    def render(self, url, depth, q):
        # Runs in the child process: load the page headlessly with QtWebEngine.
        self.q = q
        self.url = url
        self.depth = depth
        QWebEngineProfile.defaultProfile().setHttpUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36")
        self.qt = QWebEnginePage()
        #self.qt.loadStarted.connect(self._loadStarted)
        #self.qt.loadProgress.connect(self._loadProgress)
        self.qt.loadFinished.connect(self._loadFinished)
        self.qt.load(QUrl(url))
        app.exec_()

    #def _loadStarted(self):
    #    print('Load started.')

    #def _loadProgress(self, progress):
    #    print('Loading progress: ' + str(progress) + '%')
    def _loadFinished(self, result):
        #print('Load finished.')
        if self.depth == 0:
            sleep(waitForLoad)
        self.qt.toHtml(self.callable)
    def callable(self, data):
        # Receives the rendered HTML, then queues every link, email address,
        # and phone number found in it back to the parent process.
        self.html = data
        data = html.fromstring(data.encode())
        links = []
        els = data.xpath('//a')
        for el in els:
            url = el.xpath("./@href")  # the '.' indicates a local search
            title = el.xpath("./text()")
            if len(url) == 0:
                continue
            url[0] = urllib.parse.urljoin(self.url, url[0])
            print(url[0])
            links.append({'url': url[0], 'title': title[0] if len(title) > 0 else 'Link title not found'})
        txtLinks = linkReg.findall(self.html)  # catches URLs that sit in plain text rather than in <a> tags
        for link in txtLinks:
            link = link.rstrip('\'"')
            links.append({'url': link, 'title': 'Link found in text..'})
            print('TEXT LINK ' + link)
        emails = emailReg.findall(self.html)  # get email addresses
        numbers = phoneReg.findall(self.html)  # get phone numbers
        for link in links:
            link['url'] = link['url'].strip()  # remove trailing and leading spaces
            self.q.put(['link', link])
        for email in emails:
            if not any(ext in email for ext in forbiddenExtensions):
                self.q.put(['email', email])
        for number in numbers:
            self.q.put(['number', number])
        app.quit()

class LinkCrawler(object):
    def __init__(self, link, depth, depthLimit, q, **kwargs):
        print(depth)
        self.url = link['url']
        self.title = link['title']
        self.depth = depth
        self.depthLimit = depthLimit
        self.linkData = self.getLinkData()
        self.allLinks = []
        self.emails = []
        self.numbers = []
        self.sortData()
        print('Links found: ' + str(len(self.allLinks)))
        print('Email addresses found: ' + str(len(self.emails)))
        for email in self.emails:
            print(email)
        self.validLinks = []
        self.invalidLinks = []
        self.verifyLinks()
        #for link in self.validLinks:
        #    print(link['url'])
        self.children = []
        self.spawnChildren()
    def getLinkData(self):
        l = LinkScrubber()
        l.run(self.url, self.depth, q)  # q is the module-level Queue created in __main__
        return l.data
    def sortData(self):
        # The three handlers deduplicate against the module-level
        # allLinks / allEmails / allNumbers lists set up in __main__.
        def isLink(link):
            if link['url'] not in allLinks or self.depth == 0:
                self.allLinks.append(link)
                allLinks.append(link['url'])

        def isEmail(email):
            if email not in allEmails:
                self.emails.append(email)
                allEmails.append(email)

        def isNumber(number):
            if number not in allNumbers:
                self.numbers.append(number)
                allNumbers.append(number)

        dataType = {'link': isLink, 'email': isEmail, 'number': isNumber}
        for datum in self.linkData:
            dataType[datum[0]](datum[1])
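
    # Dispatch sketch (hypothetical datum): ['email', 'info@example.com'] makes
    # dataType[datum[0]] select isEmail, so the call becomes
    # isEmail('info@example.com'), which records the address only once globally.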
    def verifyLinks(self):
        for link in self.allLinks:
            if any(domain in link['url'] for domain in forbiddenDomains):
                self.invalidLinks.append(link)
            elif QUrl(link['url']).isValid():
                self.validLinks.append(link)
            else:
                self.invalidLinks.append(link)
    def spawnChildren(self):
        if self.depth < self.depthLimit:
            for link in self.validLinks:
                if link['url'] not in linksVisited:
                    linksVisited.append(link['url'])  # record the visit so the same page is not crawled twice
                    print('Visiting: ' + link['title'] + '\n' + link['url'] + '\n')
                    self.children.append(LinkCrawler(link, self.depth + 1, self.depthLimit, q))
        else:
            print('Depth limit reached.\n')

def getAllData(branch, key):
    if len(branch.emails) > 0 or len(branch.numbers) > 0:
        dataDict[key].append([branch.title, branch.url, branch.emails, branch.numbers])
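
# Each row appended above has the shape (values are made up):
#   ['About', 'https://example.com/about', ['info@example.com'], ['(123) 456-7890']]
# i.e. [page title, page url, emails on that page, phone numbers on that page].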
# Planned result structure (not yet implemented):
#data = {
#    timedOut: false,
#    hasChildren: true,
#    depth: 0,
#    loadTime: int,
#    totalTime: number,
#    allLinks: [links from self and all children],
#    allEmails: [email addresses from self and all children],
#    emails: [emails from self],
#    links: [immediate child links],
#    "immediate child link 1": same structure as whole
#}

def traverse(branch, key):
    for twig in branch.children:
        traverse(twig, key)
    getAllData(branch, key)
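
# traverse() walks the crawl tree bottom-up; for a hypothetical tree
# Base -> [A, B] with A -> [A1], rows land in dataDict in the order
# A1, A, B, Base, so every child is recorded before its parent.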

if __name__ == '__main__':
    freeze_support()  # necessary in order to properly freeze the file
    emailSource = '1'  # 'Links to Crawl'
    linkDump = '2'  # 'Email Addresses Found'
    linksVisited = []  # URLs already crawled; spawnChildren() appends to this
    allLinks = []  # records a link only once even if it appears several times on a single page
    allEmails = []
    allNumbers = []
    q = Queue(maxsize=0)
    accessToken = getAccessToken(accessToken)
    crawl = True
    while crawl:
        linkData = sheetValues(aeMarket, emailSource, 'A2:B', None, {}, None, 'GET', accessToken)  # sheet values are returned as a 2d array
        if 'values' in linkData.keys():
            linkArray = linkData['values']
            link = {}
            link['url'] = linkArray[0][0]
            link['depthLimit'] = int(linkArray[0][1]) if len(linkArray[0]) == 2 else 2
            link['title'] = 'Base'
        else:
            print('No links to crawl')
            sys.exit()
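
        # Assumed sheet layout (illustrative values): column A holds the start
        # URL, column B an optional crawl depth, e.g. A2 = https://example.com,
        # B2 = 2; a missing B cell falls back to a depth limit of 2.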
        emails = []
        dataDict = {}
        print('Starting at: ' + link['url'])
        linkTree = LinkCrawler(link, 0, link['depthLimit'], q)
        dataDict[link['url']] = []
        traverse(linkTree, link['url'])
        link['data'] = dataDict
        for key in dataDict:
            for page in dataDict[key]:
                emails.extend(page[2])  # page[2] is the email list for that page
        accessToken = getAccessToken(accessToken)  # refresh the token; the crawl may have outlived it
        headers = {"Content-Type": "application/json"}
        currentEmailData = sheetValues(aeMarket, linkDump, 'A2:A', None, {}, None, 'GET', accessToken)
        if 'values' in currentEmailData.keys():
            currentEmails = currentEmailData['values']
            for email in currentEmails:
                print(email[0])
                emails.append(email[0])
            for email in emails:
                print(email)
        else:
            print('No email addresses in list')
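
        # The PUT below writes a single column (illustrative payload):
        #   {'values': [['a@example.com', 'b@example.com']], 'majorDimension': 'COLUMNS'}
        # would fill A2:A3, since the one inner list is treated as column A.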
        body = {'values': [emails], 'majorDimension': 'COLUMNS'}
        response = sheetValues(aeMarket, linkDump, 'A2:A' + str(len(emails) + 1), body, headers, {'valueInputOption': 'RAW'}, 'PUT', accessToken)
        print(response)
        moveLinkToComplete(link, aeMarket, emailSource, accessToken)