Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade puppeteer to latest version + selenium grid support #104

Open
wants to merge 40 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
09922cb
Show absolute numbers in progress bar
muodov Jan 6, 2023
7a5bec9
Speed up url filtering
muodov Jan 12, 2023
c56379b
log how many URLs were skipped
muodov Jan 12, 2023
4983060
upgrade puppeteer (still old)
muodov Sep 3, 2024
fee07fd
WIP: manage browsers on a lower level
muodov Mar 23, 2023
40e8fc7
remove some selenium-related stuff
muodov Sep 4, 2024
dd615d7
download chromium to a local dir
muodov Mar 23, 2023
030866c
download chromium in crawler
muodov Mar 23, 2023
f2f9f23
Wait for full init on main page before reporting
muodov Mar 23, 2023
69f8154
remove unnecessary imports
muodov Sep 4, 2024
79510f5
Use raw CDP in CMP collector
muodov Sep 4, 2024
73c38db
Update chalk, node types and remove pre-push (tests are run in CI)
muodov Sep 4, 2024
b6ec60c
Remote unnecessary @types/stack-utils and upgrade tldts
muodov Sep 4, 2024
8cf124f
Upgrade commander
muodov Sep 4, 2024
4bd1822
Upgrade async library
muodov Sep 4, 2024
f4ae631
Fix some unit tests
muodov Sep 4, 2024
fc49507
Fix more tests after chromium upgrade
muodov Sep 5, 2024
fc83426
Add missing typings
muodov Sep 5, 2024
f4e80bf
Disable some API attribution tests due to a known bug
muodov Sep 5, 2024
ef7d909
Skip empty lines in url file
muodov Sep 5, 2024
0062e1e
Support latest puppeteer
muodov Sep 5, 2024
7a744d2
Download browser before parallelizing
muodov Sep 5, 2024
e5a1e9f
Lint fix
muodov Sep 5, 2024
8be3f9b
don't use detached processes
muodov Sep 5, 2024
cacd577
update puppeteer one more time
muodov Dec 4, 2024
a035239
Fix errors in launch procedure
muodov Dec 4, 2024
83eb5be
Better logging
muodov Dec 4, 2024
662be11
add time to logs
muodov Dec 4, 2024
95f6a3b
let puppeteer/browsers handle signals
muodov Dec 4, 2024
db16d69
Stagger the launch of initial browser batch
muodov Dec 5, 2024
564db6d
log crashed targets
muodov Dec 5, 2024
37da4e9
Update puppeteer
muodov Dec 5, 2024
d48349d
Add selenium support
muodov Dec 5, 2024
4569449
Remove the hard limit on the number of concurrent crawlers
muodov Dec 5, 2024
72f4b07
remove log message
muodov Dec 5, 2024
404f306
disable progress bar in CI
muodov Dec 6, 2024
da05532
Better CI log format
muodov Dec 6, 2024
d93f0f1
Add a CLI parameter for passing selenium hub
muodov Dec 6, 2024
5dec740
Lint
muodov Dec 8, 2024
3c8a778
upgrade types
muodov Dec 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ node_modules
.vscode/
data/
summary.json
chromium/
21 changes: 21 additions & 0 deletions browser/BaseBrowser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Abstract browser contract shared by the concrete browser implementations.
 *
 * Every method here is a placeholder that throws synchronously; subclasses
 * are expected to override all of them.
 */
class BaseBrowser {
    /**
     * Bring the browser up.
     *
     * @returns {Promise<void>}
     * @throws {Error} always, unless overridden by a subclass
     */
    start() {
        throw new Error('Not implemented');
    }

    /**
     * Shut the browser down.
     *
     * @throws {Error} always, unless overridden by a subclass
     */
    close() {
        throw new Error('Not implemented');
    }

    /**
     * Obtain a DevTools protocol connection to the browser.
     *
     * @returns {Promise<import('puppeteer-core').Connection>}
     * @throws {Error} always, unless overridden by a subclass
     */
    getConnection() {
        throw new Error('Not implemented');
    }
}

module.exports = BaseBrowser;
193 changes: 193 additions & 0 deletions browser/LocalChrome.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
const {mkdtemp, rm} = require('fs/promises');
const {join} = require('path');
const {tmpdir} = require('os');
const {CDP_WEBSOCKET_ENDPOINT_REGEX, launch} = require('@puppeteer/browsers');

const {Connection} = require('puppeteer-core');

// INTERNAL puppeteer classes
const {ChromeLauncher} = require('puppeteer-core/lib/cjs/puppeteer/node/ChromeLauncher.js');
const {NodeWebSocketTransport} = require('puppeteer-core/lib/cjs/puppeteer/node/NodeWebSocketTransport.js');

const BaseBrowser = require("./BaseBrowser");

/**
 * Launches a Chrome process on the local machine with a throw-away profile
 * directory and exposes a raw CDP connection to it.
 */
class LocalChrome extends BaseBrowser {
    /**
     * @param {BrowserOptions} options
     */
    constructor(options) {
        super();
        this.options = options;
        this.connection = null;
        this.browserProcess = null;
        this.userDataDir = null;
        // Set once close() has been entered, so repeated calls are no-ops.
        this.closing = false;
    }

    /**
     * Prefix handed to mkdtemp() when creating the temporary profile directory.
     * (An alternative location, '/dev/shm', was left as a commented-out option
     * in the original code.)
     *
     * @returns {string}
     */
    _getProfilePath() {
        return join(
            tmpdir(),
            `tr_collector_chrome_profile-`
        );
    }

    /**
     * Creates a fresh profile directory and launches the browser process.
     * The profile directory is removed by the `onExit` callback passed to `launch()`.
     *
     * @returns {Promise<void>}
     */
    async start() {
        const userDataDir = await mkdtemp(this._getProfilePath());
        this.userDataDir = userDataDir;

        const headless = this.options.headless;
        const devtools = !headless;

        // At the time of writing, default args are:
        // [
        //     '--allow-pre-commit-input',
        //     '--disable-background-networking',
        //     '--disable-background-timer-throttling',
        //     '--disable-backgrounding-occluded-windows',
        //     '--disable-breakpad',
        //     '--disable-client-side-phishing-detection',
        //     '--disable-component-extensions-with-background-pages',
        //     '--disable-default-apps',
        //     '--disable-dev-shm-usage', // overridden below
        //     '--disable-extensions',
        //     '--disable-hang-monitor',
        //     '--disable-infobars',
        //     '--disable-ipc-flooding-protection',
        //     '--disable-popup-blocking',
        //     '--disable-prompt-on-repost',
        //     '--disable-renderer-backgrounding',
        //     '--disable-search-engine-choice-screen',
        //     '--disable-sync',
        //     '--enable-automation',
        //     '--export-tagged-pdf',
        //     '--generate-pdf-document-outline',
        //     '--force-color-profile=srgb',
        //     '--metrics-recording-only',
        //     '--no-first-run',
        //     '--password-store=basic',
        //     '--use-mock-keychain',
        //     '--disable-features=Translate,AcceptCHFrame,MediaRouter,OptimizationHints,ProcessPerSiteUpToMainFrameThreshold,IsolateSandboxedIframes',
        //     '--enable-features=PdfOopif',
        //     '--headless=new', // depend on headless param
        //     '--hide-scrollbars', // depend on headless param
        //     '--mute-audio', // depend on headless param
        //     'about:blank',
        // ]
        const excludedArgs = [
            '--disable-dev-shm-usage', // see https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-1435707522
        ];
        const chromeArguments = ChromeLauncher.prototype.defaultArgs({
            devtools,
            headless,
            args: this.options.extraArgs,
            userDataDir,
        }).filter(arg => !excludedArgs.includes(arg));

        // Port 0: the actual DevTools endpoint is read from the process output in getConnection().
        chromeArguments.push(`--remote-debugging-port=0`);

        const removeUserDataDir = async () => {
            try {
                await rm(userDataDir, {
                    force: true,
                    recursive: true,
                    maxRetries: 5,
                });
            } catch (error) {
                console.error('Error when deleting user data dir', error);
            }
        };

        try {
            this.browserProcess = launch({
                executablePath: this.options.executablePath,
                detached: true,
                env: process.env,
                args: chromeArguments,
                handleSIGHUP: true,
                handleSIGTERM: true,
                handleSIGINT: true,
                dumpio: true, // set to true to connect stdio from the browser process to the current process
                pipe: false,
                onExit: removeUserDataDir,
            });
        } catch (error) {
            // launch() failed before a process existed, so onExit will never run:
            // clean up the profile directory here instead of leaking it.
            await removeUserDataDir();
            throw error;
        }
    }

    /**
     * Closes the browser. When a CDP connection exists, a graceful
     * `Browser.close` is attempted first and the process is force-closed only
     * if that fails. Subsequent calls return immediately.
     *
     * @returns {Promise<void>}
     * @throws {Error} if the browser was never started
     */
    async close() {
        if (!this.browserProcess) {
            throw new Error('Browser is not running');
        }
        if (this.closing) {
            return;
        }
        this.closing = true;

        if (!this.connection) {
            await this.browserProcess.close();
            return;
        }

        // Attempt to close the browser gracefully
        try {
            await this.connection.send('Browser.close');
            await this.browserProcess.hasClosed();
        } catch (error) {
            console.error('Error when closing browser connection', error);
            await this.browserProcess.close();
        } finally {
            // Release the connection even if the forced close above throws.
            this.connection.dispose();
        }
    }

    /**
     * Waits for the browser to print its DevTools WebSocket endpoint, connects
     * to it and returns the resulting CDP connection. On failure the browser is
     * closed and the original error is rethrown.
     *
     * @returns {Promise<BrowserConnection>}
     */
    async getConnection() {
        try {
            const wsTimeout = 30000;
            const browserWSEndpoint = await this.browserProcess.waitForLineOutput(
                CDP_WEBSOCKET_ENDPOINT_REGEX,
                wsTimeout
            );
            const transport = await NodeWebSocketTransport.create(browserWSEndpoint);
            const slowMo = undefined; // override for debugging
            const protocolTimeout = undefined; // override for debugging
            this.connection = new Connection(
                browserWSEndpoint,
                transport,
                slowMo,
                protocolTimeout
            );
            return this.connection;
        } catch (e) {
            console.log('error setting up connection', e);
            // Await the cleanup and contain its failure: a floating close() promise
            // would surface as an unhandled rejection and mask the original error.
            try {
                await this.close();
            } catch (closeError) {
                console.error('Error when closing browser after failed connection setup', closeError);
            }
            throw e;
        }
    }
}

module.exports = LocalChrome;

/**
* @typedef BrowserOptions
* @property {any=} viewport
* @property {string=} executablePath
* @property {string[]=} extraArgs
* @property {boolean=} headless
*/

/**
* @typedef {import('puppeteer-core').Connection} BrowserConnection
*/
146 changes: 146 additions & 0 deletions browser/RemoteChrome.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
const {Builder} = require("selenium-webdriver");

Check failure on line 1 in browser/RemoteChrome.js

View workflow job for this annotation

GitHub Actions / build (16.x)

Unused '@ts-expect-error' directive.

Check failure on line 1 in browser/RemoteChrome.js

View workflow job for this annotation

GitHub Actions / build (20.x)

Unused '@ts-expect-error' directive.
const chrome = require("selenium-webdriver/chrome");

const {Connection} = require('puppeteer-core');

// INTERNAL puppeteer classes
const {ChromeLauncher} = require('puppeteer-core/lib/cjs/puppeteer/node/ChromeLauncher.js');
const {NodeWebSocketTransport} = require('puppeteer-core/lib/cjs/puppeteer/node/NodeWebSocketTransport.js');

const BaseBrowser = require("./BaseBrowser");

/**
 * Drives a Chrome instance provided by a Selenium hub and exposes a raw CDP
 * connection to it through the hub's per-session `/se/cdp` endpoint.
 */
class RemoteChrome extends BaseBrowser {
    /**
     * @param {SeleniumOptions} options
     */
    constructor(options) {
        super();
        this.options = options;
        this.connection = null;
        this.driver = null;
        // Set once close() has been entered, so repeated calls are no-ops.
        this.closing = false;
    }

    /**
     * Builds the Chrome command-line arguments for the remote browser:
     * puppeteer's defaults plus `options.extraArgs`, minus 'about:blank'.
     * Unlike the local launcher, '--disable-dev-shm-usage' is kept here
     * (see https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-1435707522).
     *
     * @returns {string[]}
     */
    getArguments() {
        // At the time of writing, default args are:
        // [
        //     '--allow-pre-commit-input',
        //     '--disable-background-networking',
        //     '--disable-background-timer-throttling',
        //     '--disable-backgrounding-occluded-windows',
        //     '--disable-breakpad',
        //     '--disable-client-side-phishing-detection',
        //     '--disable-component-extensions-with-background-pages',
        //     '--disable-default-apps',
        //     '--disable-dev-shm-usage',
        //     '--disable-extensions',
        //     '--disable-hang-monitor',
        //     '--disable-infobars',
        //     '--disable-ipc-flooding-protection',
        //     '--disable-popup-blocking',
        //     '--disable-prompt-on-repost',
        //     '--disable-renderer-backgrounding',
        //     '--disable-search-engine-choice-screen',
        //     '--disable-sync',
        //     '--enable-automation',
        //     '--export-tagged-pdf',
        //     '--generate-pdf-document-outline',
        //     '--force-color-profile=srgb',
        //     '--metrics-recording-only',
        //     '--no-first-run',
        //     '--password-store=basic',
        //     '--use-mock-keychain',
        //     '--disable-features=Translate,AcceptCHFrame,MediaRouter,OptimizationHints,ProcessPerSiteUpToMainFrameThreshold,IsolateSandboxedIframes',
        //     '--enable-features=PdfOopif',
        //     '--headless=new', // depend on headless param
        //     '--hide-scrollbars', // depend on headless param
        //     '--mute-audio', // depend on headless param
        //     'about:blank',
        // ]
        const excludedArgs = [
            'about:blank',
        ];
        return ChromeLauncher.prototype.defaultArgs({
            headless: false, // selenium will run headful browsers
            args: this.options.extraArgs,
        }).filter(arg => !excludedArgs.includes(arg));
    }

    /**
     * Requests a Chrome session from the Selenium hub configured in
     * `options.seleniumHub`.
     *
     * @returns {Promise<void>}
     */
    async start() {
        const chromeArguments = this.getArguments();
        const opts = new chrome.Options();
        opts.addArguments(...chromeArguments);

        opts.setUserPreferences({
            "download.default_directory": "/dev/null",
        });

        this.driver = await (new Builder()
            .usingServer(this.options.seleniumHub)
            .forBrowser('chrome')
            .setChromeOptions(opts)
            .build());
    }

    /**
     * Closes the CDP connection (if any) and quits the Selenium session (if any).
     * Subsequent calls return immediately.
     *
     * @returns {Promise<void>}
     */
    async close() {
        if (this.closing) {
            return;
        }
        this.closing = true;
        if (this.connection) {
            // Attempt to close the browser gracefully
            try {
                await this.connection.send('Browser.close');
            } catch (error) {
                console.error('Error when closing browser connection', error);
            }
            this.connection.dispose();
        }
        await this.driver?.quit();
    }

    /**
     * Opens a CDP connection to the remote browser through the hub's
     * `/session/<id>/se/cdp` WebSocket endpoint. On failure the session is
     * closed and the original error is rethrown.
     *
     * @returns {Promise<BrowserConnection>}
     */
    async getConnection() {
        try {
            const hubUrl = new URL(this.options.seleniumHub);
            // Match the hub's transport security: an https hub needs a wss socket.
            const wsScheme = hubUrl.protocol === 'https:' ? 'wss' : 'ws';
            // @ts-expect-error session has the 'any' type
            const sessionId = await this.driver.getSession().then(session => session.getId());
            const browserWSEndpoint = `${wsScheme}://${hubUrl.host}/session/${sessionId}/se/cdp`;
            const transport = await NodeWebSocketTransport.create(browserWSEndpoint);

            const slowMo = undefined; // override for debugging
            const protocolTimeout = undefined; // override for debugging
            this.connection = new Connection(
                browserWSEndpoint,
                transport,
                slowMo,
                protocolTimeout
            );
            return this.connection;
        } catch (e) {
            console.log('error setting up remote connection', e);
            // Await the cleanup and contain its failure: a floating close() promise
            // would surface as an unhandled rejection and mask the original error.
            try {
                await this.close();
            } catch (closeError) {
                console.error('Error when closing remote browser after failed connection setup', closeError);
            }
            throw e;
        }
    }
}

module.exports = RemoteChrome;

/**
* @typedef SeleniumOptions
* @property {string[]=} extraArgs
* @property {string} seleniumHub
*/

/**
* @typedef {import('puppeteer-core').Connection} BrowserConnection
*/
Loading
Loading