Skip to content

Commit

Permalink
Add selenium support
Browse files Browse the repository at this point in the history
  • Loading branch information
muodov committed Dec 5, 2024
1 parent 37da4e9 commit d48349d
Show file tree
Hide file tree
Showing 8 changed files with 443 additions and 36 deletions.
8 changes: 0 additions & 8 deletions browser/BaseBrowser.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,3 @@ class BaseBrowser {
}

module.exports = BaseBrowser;

/**
* @typedef BrowserOptions
* @property {any=} viewport
* @property {string=} executablePath
* @property {string[]=} extraArgs
* @property {boolean=} headless
*/
6 changes: 5 additions & 1 deletion browser/LocalChrome.js
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,11 @@ class LocalChrome extends BaseBrowser {
module.exports = LocalChrome;

/**
* @typedef {import('./BaseBrowser').BrowserOptions} BrowserOptions
* @typedef BrowserOptions
* @property {any=} viewport
* @property {string=} executablePath
* @property {string[]=} extraArgs
* @property {boolean=} headless
*/

/**
Expand Down
146 changes: 146 additions & 0 deletions browser/RemoteChrome.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
const {Builder} = require("selenium-webdriver");
const chrome = require("selenium-webdriver/chrome");

const {Connection} = require('puppeteer-core');

// INTERNAL puppeteer classes
const {ChromeLauncher} = require('puppeteer-core/lib/cjs/puppeteer/node/ChromeLauncher.js');
const {NodeWebSocketTransport} = require('puppeteer-core/lib/cjs/puppeteer/node/NodeWebSocketTransport.js');

const BaseBrowser = require("./BaseBrowser");

/**
 * Chrome instance started through a Selenium hub and exposed to the rest of
 * the crawler as a puppeteer CDP connection.
 *
 * Lifecycle: `start()` creates the Selenium session, `getConnection()` opens
 * the CDP websocket that the hub proxies for that session, and `close()` tears
 * both of them down.
 */
class RemoteChrome extends BaseBrowser {
    /**
     * @param {SeleniumOptions} options
     */
    constructor(options) {
        super();
        this.options = options;
        this.connection = null;
        this.driver = null;
        // set once close() has been entered so repeated calls are no-ops
        this.closing = false;
    }

    /**
     * Builds the Chrome command line flags sent to the remote browser:
     * puppeteer's default (headful) arguments plus `options.extraArgs`,
     * without the trailing 'about:blank' start page.
     *
     * @returns {string[]}
     */
    getArguments() {
        // At the time of writing, default args are:
        // [
        //     '--allow-pre-commit-input',
        //     '--disable-background-networking',
        //     '--disable-background-timer-throttling',
        //     '--disable-backgrounding-occluded-windows',
        //     '--disable-breakpad',
        //     '--disable-client-side-phishing-detection',
        //     '--disable-component-extensions-with-background-pages',
        //     '--disable-default-apps',
        //     '--disable-dev-shm-usage',
        //     '--disable-extensions',
        //     '--disable-hang-monitor',
        //     '--disable-infobars',
        //     '--disable-ipc-flooding-protection',
        //     '--disable-popup-blocking',
        //     '--disable-prompt-on-repost',
        //     '--disable-renderer-backgrounding',
        //     '--disable-search-engine-choice-screen',
        //     '--disable-sync',
        //     '--enable-automation',
        //     '--export-tagged-pdf',
        //     '--generate-pdf-document-outline',
        //     '--force-color-profile=srgb',
        //     '--metrics-recording-only',
        //     '--no-first-run',
        //     '--password-store=basic',
        //     '--use-mock-keychain',
        //     '--disable-features=Translate,AcceptCHFrame,MediaRouter,OptimizationHints,ProcessPerSiteUpToMainFrameThreshold,IsolateSandboxedIframes',
        //     '--enable-features=PdfOopif',
        //     '--headless=new', // depend on headless param
        //     '--hide-scrollbars', // depend on headless param
        //     '--mute-audio', // depend on headless param
        //     'about:blank',
        // ]
        const excludedArgs = new Set([
            // '--disable-dev-shm-usage', // see https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-1435707522
            'about:blank',
        ]);
        const defaultArgs = ChromeLauncher.prototype.defaultArgs({
            headless: false, // selenium will run headful browsers
            args: this.options.extraArgs,
        });
        return defaultArgs.filter(arg => !excludedArgs.has(arg));
    }

    /**
     * Creates a Chrome session on the Selenium hub given in `options.seleniumHub`.
     *
     * @returns {Promise<void>}
     */
    async start() {
        const opts = new chrome.Options();
        opts.addArguments(...this.getArguments());

        opts.setUserPreferences({
            "download.default_directory": "/dev/null",
        });

        this.driver = await (new Builder()
            .usingServer(this.options.seleniumHub)
            .forBrowser('chrome')
            .setChromeOptions(opts)
            .build());
    }

    /**
     * Closes the CDP connection (if one was opened) and quits the Selenium
     * session. Safe to call more than once; only the first call does work.
     *
     * @returns {Promise<void>}
     */
    async close() {
        if (this.closing) {
            return;
        }
        this.closing = true;
        try {
            if (this.connection) {
                // Attempt to close the browser gracefully
                try {
                    await this.connection.send('Browser.close');
                } catch (error) {
                    console.error('Error when closing browser connection', error);
                }
                this.connection.dispose();
            }
        } finally {
            // always release the session on the hub, even if the CDP teardown threw
            await this.driver?.quit();
        }
    }

    /**
     * Opens a CDP connection to the browser of the current Selenium session
     * through the hub's `/session/<id>/se/cdp` websocket endpoint.
     * On failure the browser is closed and the original error is rethrown.
     *
     * @returns {Promise<BrowserConnection>}
     */
    async getConnection() {
        try {
            const hubUrl = new URL(this.options.seleniumHub);
            // a hub served over TLS needs a secure websocket as well
            const wsProtocol = hubUrl.protocol === 'https:' ? 'wss:' : 'ws:';
            // @ts-expect-error session has the 'any' type
            const sessionId = await this.driver.getSession().then(session => session.getId());
            const browserWSEndpoint = `${wsProtocol}//${hubUrl.host}/session/${sessionId}/se/cdp`;
            const transport = await NodeWebSocketTransport.create(browserWSEndpoint);

            let slowMo; // override for debugging
            let protocolTimeout; // override for debugging
            this.connection = new Connection(
                browserWSEndpoint,
                transport,
                slowMo,
                protocolTimeout
            );
            return this.connection;
        } catch (e) {
            console.log('error setting up remote connection', e);
            // await the cleanup so a failing close() cannot become an unhandled
            // rejection, and make sure the original error is the one reported
            try {
                await this.close();
            } catch (closeError) {
                console.error('Error when closing remote browser', closeError);
            }
            throw e;
        }
    }
}

module.exports = RemoteChrome;

/**
* @typedef SeleniumOptions
* @property {string[]=} extraArgs
* @property {string} seleniumHub
*/

/**
* @typedef {import('puppeteer-core').Connection} BrowserConnection
*/
15 changes: 13 additions & 2 deletions browser/openBrowser.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
const {VISUAL_DEBUG} = require('../constants');
const {downloadChrome} = require('../helpers/chromiumDownload');
const LocalChrome = require('./LocalChrome');
const RemoteChrome = require('./RemoteChrome');

/**
* @param {function(...any):void} log
* @param {string} proxyHost
* @param {string} executablePath path to chromium executable to use
* @returns {Promise<LocalChrome>}
* @param {string} [seleniumHub] selenium hub url
* @returns {Promise<import('./BaseBrowser')>}
*/
async function openBrowser(log, proxyHost, executablePath) {
async function openBrowser(log, proxyHost, executablePath, seleniumHub) {
const extraArgs = [
// enable FLoC
// '--enable-blink-features=InterestCohortAPI',
Expand All @@ -28,6 +30,15 @@ async function openBrowser(log, proxyHost, executablePath) {
extraArgs.push(`--host-resolver-rules=MAP * ~NOTFOUND, EXCLUDE ${url.hostname}`); // no quotes around the CLI flags needed
}

if (seleniumHub) {
const seleniumBrowser = new RemoteChrome({
extraArgs,
seleniumHub,
});
await seleniumBrowser.start();
return seleniumBrowser;
}

const browser = new LocalChrome({
extraArgs,
headless: !VISUAL_DEBUG,
Expand Down
8 changes: 7 additions & 1 deletion crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,13 @@ function isThirdPartyRequest(documentUrl, requestUrl) {
*/
async function crawl(url, options) {
const log = options.log || (() => {});
const browser = options.browserConnection ? null : await openBrowser(log, options.proxyHost, options.executablePath);
const browser = options.browserConnection ? null : await openBrowser(
log,
options.proxyHost,
options.executablePath,
// FIXME: this is a hardcoded value
'http://10.100.9.21:4444'
);
const browserConnection = options.browserConnection || await browser.getConnection();

let data = null;
Expand Down
54 changes: 34 additions & 20 deletions crawlerConductor.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,28 +91,42 @@ module.exports = async options => {
dataCollectors = urlItem.dataCollectors;
}

const staggerDelay = Number(idx) < numberOfCrawlers ? 2000 * Number(idx) : 0;
// stagger the start of the first browsers so they don't eat memory all at once
setTimeout(() => {
inProgress.add(urlString);
log(chalk.cyan(`Processing entry #${Number(idx) + 1} (${urlString}).`));
const timer = createTimer();
// const staggerDelay = Number(idx) < numberOfCrawlers ? 2000 * Number(idx) : 0;
// // stagger the start of the first browsers so they don't eat memory all at once
// setTimeout(() => {
inProgress.add(urlString);
log(chalk.cyan(`Processing entry #${Number(idx) + 1} (${urlString}).`));
const timer = createTimer();

const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.collectorFlags);
const task = crawlAndSaveData.bind(
null,
urlString,
dataCollectors,
log,
options.filterOutFirstParty,
options.dataCallback,
options.emulateMobile,
options.proxyHost,
(options.antiBotDetection !== false),
executablePath,
options.maxLoadTimeMs,
options.extraExecutionTimeMs,
options.collectorFlags
);

asyncLib.retry(MAX_NUMBER_OF_RETRIES, task, err => {
if (err) {
console.log(err);
log(chalk.red(`Max number of retries (${MAX_NUMBER_OF_RETRIES}) exceeded for "${urlString}".`));
failureCallback(urlString, err);
} else {
log(chalk.cyan(`Processing "${urlString}" took ${timer.getElapsedTime()}s.`));
}
inProgress.delete(urlString);
log(chalk.cyan(`In progress (${inProgress.size}): ${Array.from(inProgress).join(', ')}`));
callback();
});
}, staggerDelay);
asyncLib.retry(MAX_NUMBER_OF_RETRIES, task, err => {
if (err) {
console.log(err);
log(chalk.red(`Max number of retries (${MAX_NUMBER_OF_RETRIES}) exceeded for "${urlString}".`));
failureCallback(urlString, err);
} else {
log(chalk.cyan(`Processing "${urlString}" took ${timer.getElapsedTime()}s.`));
}
inProgress.delete(urlString);
log(chalk.cyan(`In progress (${inProgress.size}): ${Array.from(inProgress).join(', ')}`));
callback();
});
// }, staggerDelay);
});
};

Expand Down
Loading

0 comments on commit d48349d

Please sign in to comment.