Selenium grid support #89

Open · wants to merge 13 commits into main

Changes from 12 commits
1 change: 1 addition & 0 deletions README.md
@@ -30,6 +30,7 @@ Available options:
- `-r, --region-code <region>` - optional 2-letter region code. For metadata only
- `-a, --disable-anti-bot` - disable the simple built-in anti-bot-detection script injected into every frame
- `--chromium-version <version_number>` - use custom version of Chromium (e.g. "843427") instead of using the default
- `--selenium-hub <url>` - instead of running a local browser, request a remote browser from the provided Selenium hub endpoint
Member: please also add it to other examples in the readme (below)

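(Editor's note: a hypothetical example along those lines, assuming the repo's usual `npm run crawl` invocation: `npm run crawl -- -i urls.txt -o ./data/ --selenium-hub http://localhost:4444/wd/hub`; the exact command may differ.)
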
- `--config <path>` - path to a config file that lets you set all of the above settings (and more). Note that CLI flags have a higher priority than settings passed via config. You can find a sample config file in `tests/cli/sampleConfig.json`.
- `--autoconsent-action <action>` - automatic autoconsent action (requires the `cmps` collector). Possible values: optIn, optOut

103 changes: 66 additions & 37 deletions cli/crawl-cli.js
@@ -2,6 +2,7 @@
const path = require('path');
const fs = require('fs');
const chalk = require('chalk').default;
const asyncLib = require('async');
const runCrawlers = require('../crawlerConductor');
const program = require('commander');
const URL = require('url').URL;
@@ -34,8 +35,65 @@ program
.option('--config <path>', 'crawl configuration file')
.option('--autoconsent-action <action>', 'dismiss cookie popups. Possible values: optout, optin')
.option('--chromium-version <version_number>', 'use custom version of chromium')
.option('--selenium-hub <url>', 'selenium hub endpoint to request browsers from')
.parse(process.argv);

/**
* @param {string} outputPath
* @param {URL} url
* @param {string} fileType file extension, defaults to 'json'
*/
function createOutputPath(outputPath, url, fileType = 'json') {
return path.join(outputPath, `${createUniqueUrlName(url)}.${fileType}`);
}

/**
* @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls
* @param {function} logFunction
* @param {string} outputPath
*/
function filterUrls(inputUrls, logFunction, outputPath) {
return new Promise((resolveFilterUrls, rejectFilterUrls) => {
asyncLib.filter(inputUrls, (item, filterCallback) => {
Member: is performance here worth the complexity of doing the filtering async?

Member (Author): I didn't do detailed profiling, but with hundreds of thousands of URLs this takes quite a lot of time (tens of seconds); depending on the FS speed it could make a difference, I think. The latest async lib supports promises, so if/when we upgrade it, this should look less ugly. [Editor's note: a sketch of that promise-based version follows this function.]

const urlString = (typeof item === 'string') ? item : item.url;

/**
* @type {URL}
*/
let url;

try {
url = new URL(urlString);
} catch (e) {
logFunction(chalk.yellow('Invalid URL:'), urlString);
filterCallback(null, false);
return;
}

if (outputPath) {
// filter out entries for which result file already exists
const outputFile = createOutputPath(outputPath, url);
fs.access(outputFile, err => {
if (err) {
filterCallback(null, true);
} else {
logFunction(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
filterCallback(null, false);
}
});
return;
}
filterCallback(null, true);
}, (err, results) => {
if (err) {
logFunction(chalk.red(`Could not filter URL list: ${err}`));
rejectFilterUrls(err);
}
resolveFilterUrls(results);
});
});
}
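
(Editor's illustration, not part of the PR: the promise-based variant mentioned in the thread above. It assumes async v3+, which returns a promise when no final callback is given and accepts async iteratees, and it reuses the createOutputPath helper defined earlier.)

const fs = require('fs');
const asyncLib = require('async'); // v3+, promise-aware

function filterUrlsPromise(inputUrls, logFunction, outputPath) {
    return asyncLib.filter(inputUrls, async item => {
        const urlString = (typeof item === 'string') ? item : item.url;
        let url;
        try {
            url = new URL(urlString);
        } catch (e) {
            logFunction('Invalid URL:', urlString);
            return false;
        }
        if (outputPath) {
            // filter out entries for which a result file already exists
            const outputFile = createOutputPath(outputPath, url);
            try {
                await fs.promises.access(outputFile);
                logFunction(`Skipping "${urlString}" because output file already exists.`);
                return false;
            } catch (accessError) {
                return true; // output file does not exist yet
            }
        }
        return true;
    });
}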

/**
* @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls
* @param {string} outputPath
@@ -54,8 +112,9 @@ program
* @param {number} maxLoadTimeMs
* @param {number} extraExecutionTimeMs
* @param {Object.<string, boolean>} collectorFlags
* @param {string} seleniumHub
*/
async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) {
async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, dataCollectors, reporters, forceOverwrite, filterOutFirstParty, emulateMobile, proxyHost, regionCode, antiBotDetection, chromiumVersion, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, seleniumHub) {
const startTime = new Date();

reporters.forEach(reporter => {
@@ -71,39 +130,8 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
});
};

/**
* @type {function(...any):string}
* @param {URL} url
* @param {string} fileType file extension, defaults to 'json'
*/
const createOutputPath = ((url, fileType = 'json') => path.join(outputPath, `${createUniqueUrlName(url)}.${fileType}`));

const urls = inputUrls.filter(item => {
const urlString = (typeof item === 'string') ? item : item.url;

/**
* @type {URL}
*/
let url;

try {
url = new URL(urlString);
} catch(e) {
log(chalk.yellow('Invalid URL:'), urlString);
return false;
}

if (forceOverwrite !== true) {
// filter out entries for which result file already exists
const outputFile = createOutputPath(url);
if (fs.existsSync(outputFile)) {
log(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
return false;
}
}

return true;
});
const urls = await filterUrls(inputUrls, log, forceOverwrite === true ? null : outputPath);
log(chalk.yellow(`Skipped ${inputUrls.length - urls.length} URLs`));

const urlsLength = urls.length;
let failures = 0;
@@ -135,11 +163,11 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da

crawlTimes.push([data.testStarted, data.testFinished, data.testFinished - data.testStarted]);

const outputFile = createOutputPath(url);
const outputFile = createOutputPath(outputPath, url);

// move screenshot to its own file and only keep screenshot path in the JSON data
if (data.data.screenshots) {
const screenshotFilename = createOutputPath(url, 'jpg');
const screenshotFilename = createOutputPath(outputPath, url, 'jpg');
fs.writeFileSync(screenshotFilename, Buffer.from(data.data.screenshots, 'base64'));

data.data.screenshots = screenshotFilename;
@@ -174,6 +202,7 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
seleniumHub,
});
log(chalk.green('\n✅ Finished successfully.'));
} catch(e) {
@@ -257,5 +286,5 @@ if (!config.urls || !config.output) {
return item;
});

run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags);
run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags, config.seleniumHub);
}
6 changes: 5 additions & 1 deletion cli/crawlConfig.js
@@ -14,7 +14,7 @@ function addProtocolIfNeeded(url) {
/**
* Looks at CLI flags, JSON config etc. to figure out the final crawl config
*
* @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags
* @param {{config?: string, verbose?: boolean, forceOverwrite?: boolean, only3p?: boolean, mobile?: boolean, disableAntiBot?: boolean, output?: string, logPath?: string, crawlers?: string, proxyConfig?: string, regionCode?: string, chromiumVersion?: string, seleniumHub?: string, dataCollectors?: string, reporters?: string, url?: string, inputList?: string}} flags
* @returns {CrawlConfig}
*/
function figureOut(flags) {
@@ -65,6 +65,9 @@ function figureOut(flags) {
if (flags.chromiumVersion) {
crawlConfig.chromiumVersion = flags.chromiumVersion;
}
if (flags.seleniumHub) {
crawlConfig.seleniumHub = flags.seleniumHub;
}

// array settings
if (flags.dataCollectors) {
@@ -130,6 +133,7 @@ module.exports = {
* @property {string} proxyConfig
* @property {string} regionCode
* @property {string} chromiumVersion
* @property {string} seleniumHub
* @property {boolean} filterOutFirstParty
* @property {boolean} forceOverwrite
* @property {boolean} verbose
25 changes: 25 additions & 0 deletions constants.js
@@ -0,0 +1,25 @@
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36';
Member: nice, I like that you extracted this!

const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 10; Pixel 2 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36';

const DEFAULT_VIEWPORT = {
width: 1440,//px
height: 812//px
};
const MOBILE_VIEWPORT = {
width: 412,
height: 691,
deviceScaleFactor: 2,
isMobile: true,
hasTouch: true
};

// for debugging: will launch in window mode instead of headless, open devtools, and not close windows after the process finishes
const VISUAL_DEBUG = false;

module.exports = {
DEFAULT_USER_AGENT,
MOBILE_USER_AGENT,
DEFAULT_VIEWPORT,
MOBILE_VIEWPORT,
VISUAL_DEBUG,
};
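
(Editor's illustration, not part of the PR: a rough sketch of how these extracted constants are typically consumed when launching the browser. openBrowser here is a simplified, hypothetical stand-in for the actual setup code in crawler.js.)

const puppeteer = require('puppeteer');
const {DEFAULT_USER_AGENT, MOBILE_USER_AGENT, DEFAULT_VIEWPORT, MOBILE_VIEWPORT, VISUAL_DEBUG} = require('./constants');

// simplified stand-in for the browser setup in crawler.js
async function openBrowser(emulateMobile) {
    const browser = await puppeteer.launch({
        headless: !VISUAL_DEBUG, // VISUAL_DEBUG keeps a visible window with devtools open
        devtools: VISUAL_DEBUG
    });
    const page = await browser.newPage();
    await page.setUserAgent(emulateMobile ? MOBILE_USER_AGENT : DEFAULT_USER_AGENT);
    await page.setViewport(emulateMobile ? MOBILE_VIEWPORT : DEFAULT_VIEWPORT);
    return {browser, page};
}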
19 changes: 1 addition & 18 deletions crawler.js
@@ -4,24 +4,7 @@ const chalk = require('chalk').default;
const {createTimer} = require('./helpers/timer');
const wait = require('./helpers/wait');
const tldts = require('tldts');

const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36';
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 10; Pixel 2 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Mobile Safari/537.36';

const DEFAULT_VIEWPORT = {
width: 1440,//px
height: 812//px
};
const MOBILE_VIEWPORT = {
width: 412,
height: 691,
deviceScaleFactor: 2,
isMobile: true,
hasTouch: true
};

// for debugging: will launch in window mode instead of headless, open devtools, and not close windows after the process finishes
const VISUAL_DEBUG = false;
const {DEFAULT_USER_AGENT, MOBILE_USER_AGENT, DEFAULT_VIEWPORT, MOBILE_VIEWPORT, VISUAL_DEBUG} = require('./constants');

/**
* @param {function(...any):void} log
79 changes: 56 additions & 23 deletions crawlerConductor.js
@@ -10,58 +10,91 @@ const downloadCustomChromium = require('./helpers/downloadCustomChromium');
// eslint-disable-next-line no-unused-vars
const BaseCollector = require('./collectors/BaseCollector');
const notABot = require('./helpers/notABot');
const {getRemoteDriver, getPuppeteerContext} = require('./helpers/selenium');
const {VISUAL_DEBUG} = require('./constants');

const MAX_NUMBER_OF_CRAWLERS = 38; // by trial and error, there seem to be network bandwidth issues with more than 38 browsers.
const MAX_NUMBER_OF_RETRIES = 2;

/**
* @param {string} urlString
* @param {BaseCollector[]} dataCollectors
* @param {function} log
* @param {boolean} filterOutFirstParty
* @param {function(URL, import('./crawler').CollectResult): void} dataCallback
* @param {boolean} emulateMobile
* @param {string} proxyHost
* @param {boolean} antiBotDetection
* @param {string} executablePath
* @param {number} maxLoadTimeMs
* @param {number} extraExecutionTimeMs
* @param {Object.<string, string>} collectorFlags
* @param {string} chromiumVersion
* @param {string} seleniumHub
*/
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags) {
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, collectorFlags, chromiumVersion, seleniumHub) {
const url = new URL(urlString);
/**
* @type {function(...any):void}
*/
const prefixedLog = (...msg) => log(chalk.gray(`${url.hostname}:`), ...msg);

const data = await crawl(url, {
log: prefixedLog,
// @ts-ignore
collectors: dataCollectors.map(collector => new collector.constructor()),
filterOutFirstParty,
emulateMobile,
proxyHost,
runInEveryFrame: antiBotDetection ? notABot : undefined,
executablePath,
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
});
let browserContext = null;
Member: this selenium stuff doesn't feel right in here; this function is not about setting up a browser, but rather about doing a single site crawl and dumping the data. I'd rather do it in crawler.js/openBrowser, since that's where we set up the browser. Additionally, it would help with some of the repetition (e.g. flags such as '--enable-blink-features=InterestCohortAPI').

let driver = null;
if (seleniumHub) {
try {
prefixedLog(`Getting remote browser...`);
driver = await getRemoteDriver({seleniumHub, chromiumVersion, proxyHost});
Member: is chromiumVersion in the expected format here? Is Selenium also expecting a 'revision' instead of a regular 'version' number? I understand that we are reusing the --chromium-version flag to pass it? [Editor's note: a sketch of these helpers appears after this function.]

prefixedLog(`Got remote browser ${driver}`);
browserContext = await getPuppeteerContext(seleniumHub, driver);
} catch (e) {
prefixedLog(chalk.red(`Could not get a remote browser`), chalk.gray(e.message));
throw e;
}
}

try {
const data = await crawl(url, {
log: prefixedLog,
// @ts-ignore
collectors: dataCollectors.map(collector => new collector.constructor()),
filterOutFirstParty,
emulateMobile,
proxyHost,
runInEveryFrame: antiBotDetection ? notABot : undefined,
executablePath,
maxLoadTimeMs,
extraExecutionTimeMs,
collectorFlags,
browserContext,
});

dataCallback(url, data);
} finally {
if (driver && !VISUAL_DEBUG) {
try {
await browserContext.browser().disconnect();
await driver.quit();
} catch (e) {
prefixedLog(chalk.red(`Could not clean up remote browser`), chalk.gray(e.message));
}
}
}
}
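
(Editor's note: helpers/selenium.js is referenced above but not included in this diff. A minimal sketch of what the two helpers might look like, assuming selenium-webdriver 4 and a Selenium Grid 4 hub that advertises a per-session CDP websocket via the "se:cdp" capability; the PR's actual implementation may differ.)

const {Builder} = require('selenium-webdriver');
const chrome = require('selenium-webdriver/chrome');
const puppeteer = require('puppeteer');

async function getRemoteDriver({seleniumHub, chromiumVersion, proxyHost}) {
    const options = new chrome.Options();
    if (proxyHost) {
        options.addArguments(`--proxy-server=${proxyHost}`);
    }
    if (chromiumVersion) {
        // note: grids typically match a browser version (e.g. "93.0"), not a Chromium revision
        options.setBrowserVersion(chromiumVersion);
    }
    return new Builder()
        .forBrowser('chrome')
        .usingServer(seleniumHub)
        .setChromeOptions(options)
        .build();
}

async function getPuppeteerContext(seleniumHub, driver) {
    // Selenium Grid 4 exposes the session's CDP endpoint in the "se:cdp" capability
    const caps = await driver.getCapabilities();
    const browser = await puppeteer.connect({browserWSEndpoint: caps.get('se:cdp')});
    return browser.defaultBrowserContext();
}

module.exports = {getRemoteDriver, getPuppeteerContext};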

/**
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>}} options
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, collectorFlags?: Object.<string, boolean>, seleniumHub?: string}} options
*/
module.exports = async options => {
const deferred = createDeferred();
const log = options.logFunction || (() => {});
const failureCallback = options.failureCallback || (() => {});

let numberOfCrawlers = options.numberOfCrawlers || Math.floor(cores * 0.8);
numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers, options.urls.length);
numberOfCrawlers = Math.min(numberOfCrawlers, options.urls.length);
if (!options.seleniumHub) {
numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers);
}

// Increase number of listeners so we have at least one listener for each async process
if (numberOfCrawlers > process.getMaxListeners()) {
@@ -73,7 +106,7 @@ module.exports = async options => {
* @type {string}
*/
let executablePath;
if (options.chromiumVersion) {
if (options.chromiumVersion && !options.seleniumHub) {
executablePath = await downloadCustomChromium(log, options.chromiumVersion);
}

@@ -89,7 +122,7 @@
log(chalk.cyan(`Processing entry #${Number(idx) + 1} (${urlString}).`));
const timer = createTimer();

const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags);
const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags, options.chromiumVersion, options.seleniumHub);

async.retry(MAX_NUMBER_OF_RETRIES, task, err => {
if (err) {