Selenium grid support #89

Open · wants to merge 13 commits into main

Changes from 12 commits
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ Available options:
- `-r, --region-code <region>` - optional 2-letter region code. For metadata only
- `-a, --disable-anti-bot` - disable the simple built-in anti-bot-detection script injected into every frame
- `--chromium-version <version_number>` - use custom version of Chromium (e.g. "843427") instead of using the default
- `--selenium-hub <url>` - instead of running a local browser, request a remote browser from the provided Selenium hub endpoint
Member: please also add it to other examples in the readme (below)

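(Editor's note: a hypothetical example along those lines, assuming the repo's usual `npm run crawl` invocation: `npm run crawl -- -i urls.txt -o ./data/ --selenium-hub http://localhost:4444/wd/hub`; the exact command may differ.)
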
- `--config <path>` - path to a config file that lets you set all of the above settings (and more). Note that CLI flags have a higher priority than settings passed via config. You can find a sample config file in `tests/cli/sampleConfig.json`.
- `--autoconsent-action <action>` - automatic autoconsent action (requires the `cmps` collector). Possible values: optIn, optOut

103 changes: 66 additions & 37 deletions cli/crawl-cli.js
@@ -2,6 +2,7 @@
const path = require('path');
const fs = require('fs');
const chalk = require('chalk').default;
const asyncLib = require('async');
const runCrawlers = require('../crawlerConductor');
const program = require('commander');
const URL = require('url').URL;
@@ -34,8 +35,65 @@ program
.option('--config <path>', 'crawl configuration file')
.option('--autoconsent-action <action>', 'dismiss cookie popups. Possible values: optout, optin')
.option('--chromium-version <version_number>', 'use custom version of chromium')
.option('--selenium-hub <url>', 'selenium hub endpoint to request browsers from')
.parse(process.argv);

/**
* @param {string} outputPath
* @param {URL} url
* @param {string} fileType file extension, defaults to 'json'
*/
function createOutputPath(outputPath, url, fileType = 'json') {
return path.join(outputPath, `${createUniqueUrlName(url)}.${fileType}`);
}

/**
* @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls
* @param {function} logFunction
* @param {string} outputPath
*/
function filterUrls(inputUrls, logFunction, outputPath) {
return new Promise((resolveFilterUrls, rejectFilterUrls) => {
asyncLib.filter(inputUrls, (item, filterCallback) => {
Member: is performance here worth the complexity of doing the filtering async?

Member (Author): I didn't do detailed profiling, but with hundreds of thousands of URLs this takes quite a lot of time (tens of seconds); depending on the FS speed it could make a difference, I think. The latest async lib supports promises, so if/when we upgrade it, this should look less ugly. [Editor's note: a sketch of that promise-based version follows this function.]

const urlString = (typeof item === 'string') ? item : item.url;

/**
* @type {URL}
*/
let url;

try {
url = new URL(urlString);
} catch (e) {
logFunction(chalk.yellow('Invalid URL:'), urlString);
filterCallback(null, false);
return;
}

if (outputPath) {
// filter out entries for which result file already exists
const outputFile = createOutputPath(outputPath, url);
fs.access(outputFile, err => {
if (err) {
filterCallback(null, true);
} else {
logFunction(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
filterCallback(null, false);
}
});
return;
}
filterCallback(null, true);
}, (err, results) => {
if (err) {
logFunction(chalk.red(`Could not filter URL list: ${err}`));
rejectFilterUrls(err);
}
resolveFilterUrls(results);
});
});
}
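
(Editor's illustration, not part of the PR: the promise-based variant mentioned in the thread above. It assumes async v3+, which returns a promise when no final callback is given and accepts async iteratees, and it reuses the createOutputPath helper defined earlier.)

const fs = require('fs');
const asyncLib = require('async'); // v3+, promise-aware

function filterUrlsPromise(inputUrls, logFunction, outputPath) {
    return asyncLib.filter(inputUrls, async item => {
        const urlString = (typeof item === 'string') ? item : item.url;
        let url;
        try {
            url = new URL(urlString);
        } catch (e) {
            logFunction('Invalid URL:', urlString);
            return false;
        }
        if (outputPath) {
            // filter out entries for which a result file already exists
            const outputFile = createOutputPath(outputPath, url);
            try {
                await fs.promises.access(outputFile);
                logFunction(`Skipping "${urlString}" because output file already exists.`);
                return false;
            } catch (accessError) {
                return true; // output file does not exist yet
            }
        }
        return true;
    });
}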

/**
* @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls
* @param {string} outputPath
@@ -54,8 +112,9 @@ program
* @param {number} maxLoadTimeMs
* @param {number} extraExecutionTimeMs
* @param {Object.<string, boolean>} collectorFlags
* @param {string} seleniumHub
*/
async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) {
async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, seleniumHub) {
const startTime = new Date();

reporters.forEach(reporter => {
@@ -71,39 +130,8 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
});
};

/**
* @type {function(...any):string}
* @param {URL} url
* @param {string} fileType file extension, defaults to 'json'
*/
const createOutputPath = ((url, fileType = 'json') => path.join(outputPath, `${createUniqueUrlName(url)}.${fileType}`));

const urls = inputUrls.filter(item => {
const urlString = (typeof item === 'string') ? item : item.url;

/**
* @type {URL}
*/
let url;

try {
url = new URL(urlString);
} catch(e) {
log(chalk.yellow('Invalid URL:'), urlString);
return false;
}

if (forceOverwrite !== true) {
// filter out entries for which result file already exists
const outputFile = createOutputPath(url);
if (fs.existsSync(outputFile)) {
log(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
return false;
}
}

return true;
});
const urls = await filterUrls(inputUrls, log, forceOverwrite === true ? null : outputPath);
log(chalk.yellow(`Skipped ${inputUrls.length - urls.length} URLs`));

const urlsLength = urls.length;
let failures = 0;
@@ -135,11 +163,11 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da

crawlTimes.push([data.testStarted, data.testFinished, data.testFinished - data.testStarted]);

const outputFile = createOutputPath(url);
const outputFile = createOutputPath(outputPath, url);

// move screenshot to its own file and only keep screenshot path in the JSON data
if (data.data.screenshots) {
const screenshotFilename = createOutputPath(url, 'jpg');
const screenshotFilename = createOutputPath(outputPath, url, 'jpg');
fs.writeFileSync(screenshotFilename, Buffer.from(data.data.screenshots, 'base64'));

data.data.screenshots = screenshotFilename;
@@ -174,6 +202,7 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
seleniumHub,
});
log(chalk.green('\n✅ Finished successfully.'));
} catch(e) {
@@ -257,5 +286,5 @@ if (!config.urls || !config.output) {
return item;
});

run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags);
run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags, config.seleniumHub);
}
6 changes: 5 additions & 1 deletion cli/crawlConfig.js
@@ -14,7 +14,7 @@ function addProtocolIfNeeded(url) {
/**
* Looks at CLI flags, JSON config etc. to figure out the final crawl config
*
* @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags
* @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, seleniumHub?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags
* @returns {CrawlConfig}
*/
function figureOut(flags) {
@@ -65,6 +65,9 @@ function figureOut(flags) {
if (flags.chromiumVersion) {
crawlConfig.chromiumVersion = flags.chromiumVersion;
}
if (flags.seleniumHub) {
crawlConfig.seleniumHub = flags.seleniumHub;
}

// array settings
if (flags.dataCollectors) {
@@ -130,6 +133,7 @@ module.exports = {
* @property {string} proxyConfig
* @property {string} regionCode
* @property {string} chromiumVersion
* @property {string} seleniumHub
* @property {boolean} filterOutFirstParty
* @property {boolean} forceOverwrite
* @property {boolean} verbose
25 changes: 25 additions & 0 deletions constants.js
@@ -0,0 +1,25 @@
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36';
Member: nice, I like that you extracted this!

const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 10; Pixel 2 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36';

const DEFAULT_VIEWPORT = {
width: 1440,//px
height: 812//px
};
const MOBILE_VIEWPORT = {
width: 412,
height: 691,
deviceScaleFactor: 2,
isMobile: true,
hasTouch: true
};

// for debugging: will launch in window mode instead of headless, open devtools, and not close windows after the process finishes
const VISUAL_DEBUG = false;

module.exports = {
DEFAULT_USER_AGENT,
MOBILE_USER_AGENT,
DEFAULT_VIEWPORT,
MOBILE_VIEWPORT,
VISUAL_DEBUG,
};
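
(Editor's illustration, not part of the PR: a rough sketch of how these extracted constants are typically consumed when launching the browser. openBrowser here is a simplified, hypothetical stand-in for the actual setup code in crawler.js.)

const puppeteer = require('puppeteer');
const {DEFAULT_USER_AGENT, MOBILE_USER_AGENT, DEFAULT_VIEWPORT, MOBILE_VIEWPORT, VISUAL_DEBUG} = require('./constants');

// simplified stand-in for the browser setup in crawler.js
async function openBrowser(emulateMobile) {
    const browser = await puppeteer.launch({
        headless: !VISUAL_DEBUG, // VISUAL_DEBUG keeps a visible window with devtools open
        devtools: VISUAL_DEBUG
    });
    const page = await browser.newPage();
    await page.setUserAgent(emulateMobile ? MOBILE_USER_AGENT : DEFAULT_USER_AGENT);
    await page.setViewport(emulateMobile ? MOBILE_VIEWPORT : DEFAULT_VIEWPORT);
    return {browser, page};
}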
19 changes: 1 addition & 18 deletions crawler.js
@@ -4,24 +4,7 @@ const chalk = require('chalk').default;
const {createTimer} = require('./helpers/timer');
const wait = require('./helpers/wait');
const tldts = require('tldts');

const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36';
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 10; Pixel 2 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36';

const DEFAULT_VIEWPORT = {
width: 1440,//px
height: 812//px
};
const MOBILE_VIEWPORT = {
width: 412,
height: 691,
deviceScaleFactor: 2,
isMobile: true,
hasTouch: true
};

// for debugging: will launch in window mode instead of headless, open devtools, and not close windows after the process finishes
const VISUAL_DEBUG = false;
const {DEFAULT_USER_AGENT, MOBILE_USER_AGENT, DEFAULT_VIEWPORT, MOBILE_VIEWPORT, VISUAL_DEBUG} = require('./constants');

/**
* @param {function(...any):void} log
79 changes: 56 additions & 23 deletions crawlerConductor.js
@@ -10,58 +10,91 @@ const downloadCustomChromium = require('./helpers/downloadCustomChromium');
// eslint-disable-next-line no-unused-vars
const BaseCollector = require('./collectors/BaseCollector');
const notABot = require('./helpers/notABot');
const {getRemoteDriver, getPuppeteerContext} = require('./helpers/selenium');
const {VISUAL_DEBUG} = require('./constants');

const MAX_NUMBER_OF_CRAWLERS = 38; // by trial and error, there seem to be network bandwidth issues with more than 38 browsers.
const MAX_NUMBER_OF_RETRIES = 2;

/**
* @param {string} urlString
* @param {BaseCollector[]} dataCollectors
* @param {function} log
* @param {boolean} filterOutFirstParty
* @param {function(URL, import('./crawler').CollectResult): void} dataCallback
* @param {boolean} emulateMobile
* @param {string} proxyHost
* @param {boolean} antiBotDetection
* @param {string} executablePath
* @param {number} maxLoadTimeMs
* @param {number} extraExecutionTimeMs
* @param {Object.<string, string>} collectorFlags
* @param {string} chromiumVersion
* @param {string} seleniumHub
*/
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) {
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, chromiumVersion, seleniumHub) {
const url = new URL(urlString);
/**
* @type {function(...any):void}
*/
const prefixedLog = (...msg) => log(chalk.gray(`${url.hostname}:`), ...msg);

const data = await crawl(url, {
log: prefixedLog,
// @ts-ignore
collectors: dataCollectors.map(collector => new collector.constructor()),
filterOutFirstParty,
emulateMobile,
proxyHost,
runInEveryFrame: antiBotDetection ? notABot : undefined,
executablePath,
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
});
let browserContext = null;
Member: this selenium stuff doesn't feel right in here; this function is not about setting up a browser, but rather about doing a single site crawl and dumping the data. I'd rather do it in crawler.js/openBrowser, since that's where we set up the browser. Additionally, it would help with some of the repetition (e.g. flags such as '--enable-blink-features=InterestCohortAPI').

let driver = null;
if (seleniumHub) {
try {
prefixedLog(`Getting remote browser...`);
driver = await getRemoteDriver({seleniumHub, chromiumVersion, proxyHost});
Member: is chromiumVersion in the expected format here? Is Selenium also expecting a 'revision' instead of a regular 'version' number? I understand that we are reusing the --chromium-version flag to pass it? [Editor's note: a sketch of these helpers appears after this function.]

prefixedLog(`Got remote browser ${driver}`);
browserContext = await getPuppeteerContext(seleniumHub, driver);
} catch (e) {
prefixedLog(chalk.red(`Could not get a remote browser`), chalk.gray(e.message));
throw e;
}
}

try {
const data = await crawl(url, {
log: prefixedLog,
// @ts-ignore
collectors: dataCollectors.map(collector => new collector.constructor()),
filterOutFirstParty,
emulateMobile,
proxyHost,
runInEveryFrame: antiBotDetection ? notABot : undefined,
executablePath,
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
browserContext,
});

dataCallback(url, data);
} finally {
if (driver && !VISUAL_DEBUG) {
try {
await browserContext.browser().disconnect();
await driver.quit();
} catch (e) {
prefixedLog(chalk.red(`Could not clean up remote browser`), chalk.gray(e.message));
}
}
}
}
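
(Editor's note: helpers/selenium.js is referenced above but not included in this diff. A minimal sketch of what the two helpers might look like, assuming selenium-webdriver 4 and a Selenium Grid 4 hub that advertises a per-session CDP websocket via the "se:cdp" capability; the PR's actual implementation may differ.)

const {Builder} = require('selenium-webdriver');
const chrome = require('selenium-webdriver/chrome');
const puppeteer = require('puppeteer');

async function getRemoteDriver({seleniumHub, chromiumVersion, proxyHost}) {
    const options = new chrome.Options();
    if (proxyHost) {
        options.addArguments(`--proxy-server=${proxyHost}`);
    }
    if (chromiumVersion) {
        // note: grids typically match a browser version (e.g. "93.0"), not a Chromium revision
        options.setBrowserVersion(chromiumVersion);
    }
    return new Builder()
        .forBrowser('chrome')
        .usingServer(seleniumHub)
        .setChromeOptions(options)
        .build();
}

async function getPuppeteerContext(seleniumHub, driver) {
    // Selenium Grid 4 exposes the session's CDP endpoint in the "se:cdp" capability
    const caps = await driver.getCapabilities();
    const browser = await puppeteer.connect({browserWSEndpoint: caps.get('se:cdp')});
    return browser.defaultBrowserContext();
}

module.exports = {getRemoteDriver, getPuppeteerContext};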

/**
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>}} options
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>, seleniumHub?: string}} options
*/
module.exports = async options => {
const deferred = createDeferred();
const log = options.logFunction || (() => {});
const failureCallback = options.failureCallback || (() => {});

let numberOfCrawlers = options.numberOfCrawlers || Math.floor(cores * 0.8);
numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers, options.urls.length);
numberOfCrawlers = Math.min(numberOfCrawlers, options.urls.length);
if (!options.seleniumHub) {
numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers);
}

// Increase number of listeners so we have at least one listener for each async process
if (numberOfCrawlers > process.getMaxListeners()) {
@@ -73,7 +106,7 @@ module.exports = async options => {
* @type {string}
*/
let executablePath;
if (options.chromiumVersion) {
if (options.chromiumVersion && !options.seleniumHub) {
executablePath = await downloadCustomChromium(log, options.chromiumVersion);
}

@@ -89,7 +122,7 @@
log(chalk.cyan(`Processing entry #${Number(idx) + 1} (${urlString}).`));
const timer = createTimer();

const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags);
const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags, options.chromiumVersion, options.seleniumHub);

async.retry(MAX_NUMBER_OF_RETRIES, task, err => {
if (err) {