-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphantomjs-example.js
125 lines (101 loc) · 2.92 KB
/
phantomjs-example.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/**
* Created by Oliver on 24.02.2016.
*/
var phantomAPI = require("phantom"),
Crawler = require("simplecrawler"),
colors = require("colors/safe"),
phantomjs = require("phantomjs");
// Crawler instance shared by the whole script. The arguments look like
// (host, initial path, port, interval) -- confirm against the installed
// simplecrawler version.
var crawler = new Crawler("www.example.com", "/", 80, 0);

// Path of the PhantomJS executable as reported by the `phantomjs` package.
var phantomBin = phantomjs.path;

// URLs ending in one of these extensions are never handed to PhantomJS.
var phantomBannedExtensions = /\.(png|jpg|jpeg|gif|ico|css|js|csv|doc|docx|pdf)$/i;

// URLs waiting to be opened in PhantomJS (consumed by processQueue).
var phantomQueue = [];

// Create the PhantomJS instance with that binary; runCrawler is the callback.
phantomAPI.create({ binary: phantomBin }, runCrawler);
// Events which end up being a bit noisy
var boringEvents = [
    "queueduplicate",
    "fetchstart",
    "discoverycomplete"
];

/**
 * Right-pads an event name with spaces to at least 20 characters so the
 * URLs printed after it line up in the console.
 *
 * @param {*} name - event name; coerced to a string before padding
 * @returns {string} the padded name
 */
function padEventName(name) {
    var padded = String(name);

    while (padded.length < 20) {
        padded += " ";
    }

    return padded;
}

// Replace original emit so we can sample all events easily
// and log them to console
var originalEmit = crawler.emit;

/**
 * Logging wrapper around the crawler's original emit.
 *
 * @param {string} name - event name
 * @param {string|Object} [queueItem] - either a URL string or an object
 *     carrying a `url` property; anything else is logged with an empty URL
 * @returns {*} whatever the original emit returns
 */
crawler.emit = function(name, queueItem) {
    var url = "";

    if (typeof queueItem === "string") {
        url = queueItem;
    } else if (queueItem && queueItem.url) {
        url = queueItem.url;
    }

    // Only events that are not on the noisy list get logged
    if (boringEvents.indexOf(name) === -1) {
        console.log(colors.cyan("%s") + "%s", padEventName(name), url);
    }

    // Forward every argument untouched and hand back the original result,
    // so code that inspects emit's return value keeps working.
    return originalEmit.apply(crawler, arguments);
};
// Terminate the process with exit status 0 once the crawler emits "complete".
crawler.on("complete", function() {
    process.exit(0);
});
/**
 * Callback handed to phantomAPI.create: hooks the crawler up to the PhantomJS
 * queue and starts the crawl.
 *
 * Every URL added to the crawler's queue whose extension is not banned is
 * pushed onto phantomQueue, and processQueue is asked to work through it.
 *
 * @param {Object} phantom - the PhantomJS instance passed on to processQueue
 */
function runCrawler(phantom) {
    // Subscribe before starting, so that a "queueadd" emitted while the
    // crawler starts up cannot slip past the listener.
    crawler.on("queueadd", function(queueItem) {
        if (phantomBannedExtensions.test(queueItem.url)) {
            return;
        }

        // `this` is the emitting crawler. wait() presumably keeps the crawl
        // from completing until the returned function is called -- confirm
        // against the simplecrawler documentation.
        var resume = this.wait();

        phantomQueue.push(queueItem.url);
        processQueue(phantom, resume);
    });

    crawler.start();
}
/**
 * Opens `url` in a fresh PhantomJS page, runs findPageLinks inside it and
 * queues every discovered URL on the crawler.
 *
 * @param {Object} phantom - PhantomJS instance, passed through to makePage
 * @param {string} url - address to load
 * @param {Function} callback - called with no arguments once the page has
 *     been evaluated, whether or not any links were found
 */
function getLinks(phantom, url, callback) {
    console.log(colors.green("Phantom attempting to load ") + colors.cyan("%s"), url);

    makePage(phantom, url, function(page, status) {
        console.log(
            colors.green("Phantom opened URL with %s — ") + colors.cyan("%s"), status, url);

        page.evaluate(findPageLinks, function(result) {
            // A new page is created for every URL; close it once we are done
            // so pages do not accumulate for the lifetime of the crawl.
            if (typeof page.close === "function") {
                page.close();
            }

            // The evaluation may not hand back an array (for example when the
            // page failed to load); skip queueing instead of throwing so the
            // caller's queue keeps moving.
            if (Array.isArray(result)) {
                result.forEach(function(link) {
                    crawler.queueURL(link);
                });
            }

            callback();
        });
    });
}
/**
 * Collects the addresses referenced by the <a>, <link> and <img> elements of
 * the current document.
 *
 * This function is handed to page.evaluate(), so it only touches `document`
 * and its own locals. NOTE(review): it presumably runs inside the PhantomJS
 * page rather than in Node -- keep it self-contained and ES5-only.
 *
 * @returns {string[]} the non-empty `href` (or, failing that, `src`) values
 */
function findPageLinks() {
    var elements = [].slice.call(document.querySelectorAll("a, link, img"));

    return elements
        .map(function(element) {
            // Only href/src can hold an address; onclick is a handler
            // function and must not be reported as a link.
            return element.href || element.src;
        })
        .filter(function(address) {
            return typeof address === "string" && address.length > 0;
        });
}
/**
 * Creates a new page on the given PhantomJS instance and opens `url` in it.
 *
 * @param {Object} phantom - object exposing createPage(callback)
 * @param {string} url - address passed to page.open
 * @param {Function} callback - invoked as callback(page, status) with the
 *     created page and the status reported by page.open
 */
function makePage(phantom, url, callback) {
    function openInPage(page) {
        function reportStatus(status) {
            callback(page, status);
        }

        page.open(url, reportStatus);
    }

    phantom.createPage(openInPage);
}
// True while a processQueue run is working through phantomQueue.
var queueBeingProcessed = false;

// Resume callbacks handed to processQueue that have not been called yet.
var pendingResumes = [];

/**
 * Works through phantomQueue one URL at a time via getLinks.
 *
 * Only one run is active at a time. A call made while a run is in progress
 * just registers its `resume` callback; every registered callback is invoked
 * once the queue has been drained.
 *
 * @param {Object} phantom - PhantomJS instance passed on to getLinks
 * @param {Function} resume - called (without arguments) when the queue is empty
 */
function processQueue(phantom, resume) {
    // Remember the callback even when a run is already active; dropping it
    // would leave that caller waiting forever.
    if (typeof resume === "function") {
        pendingResumes.push(resume);
    }

    if (queueBeingProcessed) {
        return;
    }

    queueBeingProcessed = true;

    (function processor(item) {
        if (!item) {
            console.log(colors.green("Phantom reached end of queue! ------------"));
            queueBeingProcessed = false;

            // Swap the list out first so callbacks registered while we are
            // notifying belong to the next run.
            var finished = pendingResumes;
            pendingResumes = [];
            finished.forEach(function(release) {
                release();
            });
            return;
        }

        getLinks(phantom, item, function() {
            // Break up stack so we don't blow it. The next item is taken when
            // the timer fires, so a URL queued during the delay is still
            // picked up by this run.
            setTimeout(function() {
                processor(phantomQueue.shift());
            }, 10);
        });
    })(phantomQueue.shift());
}