// scraper.js
// Forked from considine/email-crawler.
var rp = require("request-promise");
var HtmlParser = require("./html-parser-utils.js");
var unique = require("array-unique");
var validator = require("validator");
/**
 * Level-by-level (breadth-first) crawler that collects e-mail addresses
 * starting from a single entry URL.
 *
 * Must be invoked with `new`: the public API is the `getLevels` method that
 * is attached to the created instance. All crawl state (queue, visited URLs,
 * collected e-mails, current level, timeout) lives in this closure and
 * persists between `getLevels` calls.
 *
 * @param {string} domain - Entry URL to crawl first. It is also handed to
 *   every `HtmlParser` instance as its second constructor argument
 *   (presumably to scope link extraction to the site — verify in
 *   html-parser-utils.js).
 */
function WebsiteEmailScraper (domain) {
  // URLs to fetch on the next level; level 1 is just the entry URL.
  let websiteQueue = [domain];
  // Sets give O(1) membership checks and de-duplicate for free.
  const pastCrawled = new Set();
  const emails = new Set();
  let currentLevel = 1;
  let timeout = 5000;

  /**
   * Crawls level by level until `maxLevel` has been processed, the time
   * budget has elapsed, or there is nothing left to crawl.
   *
   * The level counter is kept between calls, so a later call with a larger
   * `maxLevel` continues where the previous call stopped.
   *
   * @param {number} maxLevel - Last level to crawl (1 = only the entry URL).
   *   `0` returns an empty array without crawling.
   * @param {number} [newTimeout] - Milliseconds. Used both as the overall
   *   budget for this call and as the `timeout` option passed to every
   *   request. When omitted (or falsy) the previously set value is kept;
   *   the initial value is 5000.
   * @returns {Promise<string[]>} Unique addresses collected so far that pass
   *   `validator.isEmail`, in first-seen order.
   */
  this.getLevels = async function (maxLevel, newTimeout) {
    if (maxLevel === 0) return [];

    let stop = false;
    timeout = newTimeout || timeout;
    // The flag is only checked between levels, so a level that is already
    // in flight is allowed to finish.
    const execTimeout = setTimeout(() => {
      stop = true;
    }, timeout);

    try {
      // Stopping on an empty queue matters: awaiting an empty level only
      // yields to the microtask queue, so with a large `maxLevel` the loop
      // would otherwise spin without ever letting the stop timer fire.
      while (currentLevel <= maxLevel && !stop && websiteQueue.length > 0) {
        await getLevel();
        currentLevel++;
      }
    } finally {
      // Always release the timer so it cannot keep the process alive.
      clearTimeout(execTimeout);
    }

    return [...emails].filter((email) => validator.isEmail(email));
  };

  /**
   * Fetches every not-yet-crawled URL of the current queue in parallel,
   * records the e-mails found, and replaces the queue with the unique,
   * not-yet-crawled links discovered on those pages.
   *
   * @returns {Promise<void>}
   */
  async function getLevel () {
    const pending = websiteQueue.filter((url) => !pastCrawled.has(url));
    const discovered = [];

    await Promise.all(pending.map(async (url) => {
      try {
        const htmlString = await rp({ url, headers: { 'User-Agent': 'request' }, timeout });
        // Declared locally: an undeclared `parser` would be an implicit
        // global and throws in strict mode / ES modules.
        const parser = new HtmlParser(htmlString, domain);
        for (const link of parser.extractLinks()) discovered.push(link);
        for (const email of parser.extractEmails()) emails.add(email);
      } catch (error) {
        // Deliberate best effort: a page that cannot be fetched or parsed
        // is skipped so that one bad URL does not abort the whole crawl.
      } finally {
        // Mark as crawled even on failure so the URL is never retried.
        pastCrawled.add(url);
      }
    }));

    websiteQueue = [...new Set(discovered)].filter((url) => !pastCrawled.has(url));
  }
}
// CommonJS export: the module's sole export is the WebsiteEmailScraper constructor.
module.exports = WebsiteEmailScraper;