-
Notifications
You must be signed in to change notification settings - Fork 49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Selenium grid support #89
base: main
Are you sure you want to change the base?
Changes from 12 commits
6ef465e
a732a5e
16ada25
312a942
07d8540
8691b67
00216f1
64d4ea4
79742bc
0654f73
2e58df5
b766f8b
5d9f50c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
const path = require('path'); | ||
const fs = require('fs'); | ||
const chalk = require('chalk').default; | ||
const asyncLib = require('async'); | ||
const runCrawlers = require('../crawlerConductor'); | ||
const program = require('commander'); | ||
const URL = require('url').URL; | ||
|
@@ -34,8 +35,65 @@ program | |
.option('--config <path>', 'crawl configuration file') | ||
.option('--autoconsent-action <action>', 'dismiss cookie popups. Possible values: optout, optin') | ||
.option('--chromium-version <version_number>', 'use custom version of chromium') | ||
.option('--selenium-hub <url>', 'selenium hub endpoint to request browsers from') | ||
.parse(process.argv); | ||
|
||
/**
 * Builds the path of the result file for a given URL inside the output folder.
 * The file name is derived from the URL via `createUniqueUrlName`.
 *
 * @param {string} outputPath folder the file path is joined onto
 * @param {URL} url
 * @param {string} fileType file extension, defaults to 'json'
 * @returns {string}
 */
function createOutputPath(outputPath, url, fileType = 'json') {
    const fileName = `${createUniqueUrlName(url)}.${fileType}`;

    return path.join(outputPath, fileName);
}
|
||
/**
 * Asynchronously filters the input list down to the entries that should be crawled.
 * Entries whose URL cannot be parsed are dropped (and logged). When `outputPath` is
 * truthy, entries whose output file already exists are dropped (and logged) as well.
 *
 * @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls
 * @param {function} logFunction called with a message for every dropped entry and on failure
 * @param {string} outputPath output folder; pass a falsy value to skip the "already exists" check
 * @returns {Promise<Array<string|{url:string, dataCollectors?:BaseCollector[]}>>} entries that passed the filter
 */
function filterUrls(inputUrls, logFunction, outputPath) {
    return new Promise((resolveFilterUrls, rejectFilterUrls) => {
        // Review note: filtering is done asynchronously on purpose. Per the PR discussion
        // (not profiled in detail), with hundreds of thousands of URLs this step takes
        // tens of seconds, and depending on FS speed async checks could make a difference.
        asyncLib.filter(inputUrls, (item, filterCallback) => {
            const urlString = (typeof item === 'string') ? item : item.url;

            /**
             * @type {URL}
             */
            let url;

            try {
                url = new URL(urlString);
            } catch (e) {
                logFunction(chalk.yellow('Invalid URL:'), urlString);
                filterCallback(null, false);
                return;
            }

            if (outputPath) {
                // filter out entries for which result file already exists
                const outputFile = createOutputPath(outputPath, url);
                fs.access(outputFile, err => {
                    if (err) {
                        // file is not accessible, so the entry is kept for crawling
                        filterCallback(null, true);
                    } else {
                        logFunction(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
                        filterCallback(null, false);
                    }
                });
                return;
            }
            filterCallback(null, true);
        }, (err, results) => {
            if (err) {
                logFunction(chalk.red(`Could not filter URL list: ${err}`));
                rejectFilterUrls(err);
                // stop here so we don't fall through and also call resolve
                return;
            }
            resolveFilterUrls(results);
        });
    });
}
|
||
/** | ||
* @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls | ||
* @param {string} outputPath | ||
|
@@ -54,8 +112,9 @@ program | |
* @param {number} maxLoadTimeMs | ||
* @param {number} extraExecutionTimeMs | ||
* @param {Object.<string, boolean>} collectorFlags | ||
* @param {string} seleniumHub | ||
*/ | ||
async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) { | ||
async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, seleniumHub) { | ||
const startTime = new Date(); | ||
|
||
reporters.forEach(reporter => { | ||
|
@@ -71,39 +130,8 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da | |
}); | ||
}; | ||
|
||
/** | ||
* @type {function(...any):string} | ||
* @param {URL} url | ||
* @param {string} fileType file extension, defaults to 'json' | ||
*/ | ||
const createOutputPath = ((url, fileType = 'json') => path.join(outputPath, `${createUniqueUrlName(url)}.${fileType}`)); | ||
|
||
const urls = inputUrls.filter(item => { | ||
const urlString = (typeof item === 'string') ? item : item.url; | ||
|
||
/** | ||
* @type {URL} | ||
*/ | ||
let url; | ||
|
||
try { | ||
url = new URL(urlString); | ||
} catch(e) { | ||
log(chalk.yellow('Invalid URL:'), urlString); | ||
return false; | ||
} | ||
|
||
if (forceOverwrite !== true) { | ||
// filter out entries for which result file already exists | ||
const outputFile = createOutputPath(url); | ||
if (fs.existsSync(outputFile)) { | ||
log(chalk.yellow(`Skipping "${urlString}" because output file already exists.`)); | ||
return false; | ||
} | ||
} | ||
|
||
return true; | ||
}); | ||
const urls = await filterUrls(inputUrls, log, forceOverwrite === true ? null : outputPath); | ||
log(chalk.yellow(`Skipped ${inputUrls.length - urls.length} URLs`)); | ||
|
||
const urlsLength = urls.length; | ||
let failures = 0; | ||
|
@@ -135,11 +163,11 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da | |
|
||
crawlTimes.push([data.testStarted, data.testFinished, data.testFinished - data.testStarted]); | ||
|
||
const outputFile = createOutputPath(url); | ||
const outputFile = createOutputPath(outputPath, url); | ||
|
||
// move screenshot to its own file and only keep screenshot path in the JSON data | ||
if (data.data.screenshots) { | ||
const screenshotFilename = createOutputPath(url, 'jpg'); | ||
const screenshotFilename = createOutputPath(outputPath, url, 'jpg'); | ||
fs.writeFileSync(screenshotFilename, Buffer.from(data.data.screenshots, 'base64')); | ||
|
||
data.data.screenshots = screenshotFilename; | ||
|
@@ -174,6 +202,7 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da | |
maxLoadTimeMs, | ||
extraExecutionTimeMs, | ||
collectorFlags, | ||
seleniumHub, | ||
}); | ||
log(chalk.green('\n✅ Finished successfully.')); | ||
} catch(e) { | ||
|
@@ -257,5 +286,5 @@ if (!config.urls || !config.output) { | |
return item; | ||
}); | ||
|
||
run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags); | ||
run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags, config.seleniumHub); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Desktop user-agent string (Chrome 93 on macOS 10.15.7). NOTE(review): presumably used when mobile emulation is off - confirm against callers. | ||
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice, I like that you extracted this! |
||
// Mobile user-agent string (Chrome 93 Mobile on an Android 10 Pixel 2 XL). | ||
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 10; Pixel 2 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36'; | ||
|
||
// Desktop viewport size; both dimensions are in pixels. | ||
const DEFAULT_VIEWPORT = { | ||
width: 1440,//px | ||
height: 812//px | ||
}; | ||
// Mobile viewport: smaller size plus device scale factor, mobile and touch flags. | ||
// NOTE(review): field names look like puppeteer viewport options - confirm against the consumer. | ||
const MOBILE_VIEWPORT = { | ||
width: 412, | ||
height: 691, | ||
deviceScaleFactor: 2, | ||
isMobile: true, | ||
hasTouch: true | ||
}; | ||
|
||
// for debugging: will launch in windowed mode instead of headless, open devtools and not close windows after the process finishes | ||
const VISUAL_DEBUG = false; | ||
|
||
// Export the user-agent, viewport and debug constants defined in this file. | ||
module.exports = { | ||
DEFAULT_USER_AGENT, | ||
MOBILE_USER_AGENT, | ||
DEFAULT_VIEWPORT, | ||
MOBILE_VIEWPORT, | ||
VISUAL_DEBUG, | ||
}; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,58 +10,91 @@ const downloadCustomChromium = require('./helpers/downloadCustomChromium'); | |
// eslint-disable-next-line no-unused-vars | ||
const BaseCollector = require('./collectors/BaseCollector'); | ||
const notABot = require('./helpers/notABot'); | ||
const {getRemoteDriver, getPuppeteerContext} = require('./helpers/selenium'); | ||
const {VISUAL_DEBUG} = require('./constants'); | ||
|
||
const MAX_NUMBER_OF_CRAWLERS = 38;// by trial and error there seems to be network bandwidth issues with more than 38 browsers. | ||
const MAX_NUMBER_OF_RETRIES = 2; | ||
|
||
/** | ||
* @param {string} urlString | ||
* @param {string} urlString | ||
* @param {BaseCollector[]} dataCollectors | ||
* @param {function} log | ||
* @param {function} log | ||
* @param {boolean} filterOutFirstParty | ||
* @param {function(URL, import('./crawler').CollectResult): void} dataCallback | ||
* @param {function(URL, import('./crawler').CollectResult): void} dataCallback | ||
* @param {boolean} emulateMobile | ||
* @param {string} proxyHost | ||
* @param {boolean} antiBotDetection | ||
* @param {string} executablePath | ||
* @param {number} maxLoadTimeMs | ||
* @param {number} extraExecutionTimeMs | ||
* @param {Object.<string, string>} collectorFlags | ||
* @param {string} chromiumVersion | ||
* @param {string} seleniumHub | ||
*/ | ||
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) { | ||
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, chromiumVersion, seleniumHub) { | ||
const url = new URL(urlString); | ||
/** | ||
* @type {function(...any):void} | ||
* @type {function(...any):void} | ||
*/ | ||
const prefixedLog = (...msg) => log(chalk.gray(`${url.hostname}:`), ...msg); | ||
|
||
const data = await crawl(url, { | ||
log: prefixedLog, | ||
// @ts-ignore | ||
collectors: dataCollectors.map(collector => new collector.constructor()), | ||
filterOutFirstParty, | ||
emulateMobile, | ||
proxyHost, | ||
runInEveryFrame: antiBotDetection ? notABot : undefined, | ||
executablePath, | ||
maxLoadTimeMs, | ||
extraExecutionTimeMs, | ||
collectorFlags, | ||
}); | ||
let browserContext = null; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This selenium stuff doesn't feel right in here: this function is not about setting up a browser, but rather about doing a single site crawl and dumping the data. I'd rather do it in crawler.js/openBrowser - that's where we are setting up the browser. Additionally, it will help with some of the repetition (e.g. flags - '--enable-blink-features=InterestCohortAPI'). |
||
let driver = null; | ||
if (seleniumHub) { | ||
try { | ||
prefixedLog(`Getting remote browser...`); | ||
driver = await getRemoteDriver({seleniumHub, chromiumVersion, proxyHost}); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is |
||
prefixedLog(`Got remote browser ${driver}`); | ||
browserContext = await getPuppeteerContext(seleniumHub, driver); | ||
} catch (e) { | ||
prefixedLog(chalk.red(`Could not get a remote browser`), chalk.gray(e.message)); | ||
throw e; | ||
} | ||
} | ||
|
||
try { | ||
const data = await crawl(url, { | ||
log: prefixedLog, | ||
// @ts-ignore | ||
collectors: dataCollectors.map(collector => new collector.constructor()), | ||
filterOutFirstParty, | ||
emulateMobile, | ||
proxyHost, | ||
runInEveryFrame: antiBotDetection ? notABot : undefined, | ||
executablePath, | ||
maxLoadTimeMs, | ||
extraExecutionTimeMs, | ||
collectorFlags, | ||
browserContext, | ||
}); | ||
|
||
dataCallback(url, data); | ||
dataCallback(url, data); | ||
} finally { | ||
if (driver && !VISUAL_DEBUG) { | ||
try { | ||
await browserContext.browser().disconnect(); | ||
await driver.quit(); | ||
} catch (e) { | ||
prefixedLog(chalk.red(`Could not clean up remote browser`), chalk.gray(e.message)); | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>}} options | ||
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>, seleniumHub?: string}} options | ||
*/ | ||
module.exports = async options => { | ||
const deferred = createDeferred(); | ||
const log = options.logFunction || (() => {}); | ||
const failureCallback = options.failureCallback || (() => {}); | ||
|
||
let numberOfCrawlers = options.numberOfCrawlers || Math.floor(cores * 0.8); | ||
numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers, options.urls.length); | ||
numberOfCrawlers = Math.min(numberOfCrawlers, options.urls.length); | ||
if (!options.seleniumHub) { | ||
numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers); | ||
} | ||
|
||
// Increase number of listeners so we have at least one listener for each async process | ||
if (numberOfCrawlers > process.getMaxListeners()) { | ||
|
@@ -73,7 +106,7 @@ module.exports = async options => { | |
* @type {string} | ||
*/ | ||
let executablePath; | ||
if (options.chromiumVersion) { | ||
if (options.chromiumVersion && !options.seleniumHub) { | ||
executablePath = await downloadCustomChromium(log, options.chromiumVersion); | ||
} | ||
|
||
|
@@ -89,7 +122,7 @@ module.exports = async options => { | |
log(chalk.cyan(`Processing entry #${Number(idx) + 1} (${urlString}).`)); | ||
const timer = createTimer(); | ||
|
||
const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags); | ||
const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags, options.chromiumVersion, options.seleniumHub); | ||
|
||
async.retry(MAX_NUMBER_OF_RETRIES, task, err => { | ||
if (err) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please also add it to other examples in the readme (below)