Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade puppeteer and other dependencies #102

Draft
wants to merge 19 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ node_modules
.vscode/
data/
summary.json
chromium/
29 changes: 29 additions & 0 deletions browser/BaseBrowser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
 * Creates the error thrown by every abstract BaseBrowser method.
 *
 * @returns {Error}
 */
function notImplementedError() {
    return new Error('Not implemented');
}

/**
 * Abstract browser interface. Each method throws synchronously until a
 * subclass overrides it.
 */
class BaseBrowser {
    /**
     * Launches the browser. Must be overridden.
     *
     * @returns {Promise<void>}
     */
    start() {
        throw notImplementedError();
    }

    /**
     * Shuts the browser down. Must be overridden.
     */
    close() {
        throw notImplementedError();
    }

    /**
     * Provides a connection to the running browser. Must be overridden.
     *
     * @returns {Promise<import('puppeteer-core/lib/cjs/puppeteer/common/Connection').Connection>}
     */
    getConnection() {
        throw notImplementedError();
    }
}

module.exports = BaseBrowser;

/**
* @typedef BrowserOptions
* @property {any=} viewport
* @property {string=} executablePath
* @property {string[]=} extraArgs
* @property {boolean=} headless
*/
109 changes: 109 additions & 0 deletions browser/LocalChrome.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
const {mkdtemp} = require('fs/promises');
const {join} = require('path');
const {tmpdir} = require('os');
const {BrowserRunner} = require('puppeteer-core/lib/cjs/puppeteer/node/BrowserRunner.js');
const {ChromeLauncher} = require('puppeteer-core/lib/cjs/puppeteer/node/ChromeLauncher.js');

const BaseBrowser = require("./BaseBrowser");
const {DEFAULT_VIEWPORT} = require('../constants');

/**
 * Browser implementation that spawns a local Chrome/Chromium process through
 * puppeteer-core's BrowserRunner and exposes a connection to it.
 */
class LocalChrome extends BaseBrowser {
    /**
     * @param {BrowserOptions} options
     */
    constructor(options) {
        super();
        this.options = options;
        // populated by getConnection()
        this.connection = null;
        // populated by start(); stays null until the browser has been launched
        this.runner = null;
        // populated by start() with a freshly created temporary profile directory
        this.userDataDir = null;
        /** @type import('puppeteer-core/lib/cjs/puppeteer/node/LaunchOptions') */
        this.launchOptions = {
            ignoreDefaultArgs: false,
            args: options.extraArgs,
            dumpio: false,
            pipe: false,
            // eslint-disable-next-line no-process-env
            env: process.env,
            handleSIGINT: true,
            handleSIGTERM: true,
            handleSIGHUP: true,
            ignoreHTTPSErrors: false,
            defaultViewport: options.viewport || DEFAULT_VIEWPORT,
            slowMo: 0,
            timeout: 30000,
            waitForInitialPage: true,
            channel: undefined,
            executablePath: options.executablePath,
            debuggingPort: undefined,
            protocol: undefined,
        };
    }

    /**
     * Returns the path prefix handed to mkdtemp() when creating the profile directory.
     *
     * @returns {string}
     */
    _getProfilePath() {
        return join(
            tmpdir(),
            `tr_collector_chrome_profile-`
        );
    }

    /**
     * Creates a temporary profile directory and launches the browser process.
     *
     * @returns {Promise<void>}
     */
    async start() {
        this.userDataDir = await mkdtemp(this._getProfilePath());

        // devtools are opened only when running with a visible window
        const devtools = !this.options.headless;
        const headless = this.options.headless ? 'new' : false;

        const chromeArguments = ChromeLauncher.prototype.defaultArgs({
            devtools,
            headless,
            args: this.options.extraArgs,
            userDataDir: this.userDataDir,
        });
        // NOTE(review): port 0 presumably lets Chrome pick a free debugging port - confirm
        chromeArguments.push(`--remote-debugging-port=0`);

        // NOTE(review): the trailing `true` presumably marks the profile dir as temporary - confirm against BrowserRunner
        this.runner = new BrowserRunner('chrome', this.options.executablePath, chromeArguments, this.userDataDir, true);
        this.runner.start(this.launchOptions);
    }

    /**
     * Closes the browser process.
     *
     * @returns {Promise<void>}
     * @throws {Error} when the browser was never started or has no running process
     */
    async close() {
        // runner is null until start() has been called - guard it so callers get
        // the intended error instead of a TypeError on `null.proc`
        if (!this.runner || !this.runner.proc) {
            throw new Error('Browser is not running');
        }
        await this.runner.close();
    }

    /**
     * Sets up a connection to the launched browser and remembers it on the instance.
     * If the connection cannot be established the browser process is killed and
     * the original error is rethrown.
     *
     * @returns {Promise<BrowserConnection>}
     * @throws {Error} when the browser was never started
     */
    async getConnection() {
        // without this guard a missing runner would throw inside the try block
        // and then again in the catch handler, hiding the real problem
        if (!this.runner) {
            throw new Error('Browser is not running');
        }

        try {
            this.connection = await this.runner.setupConnection({
                timeout: 30000,
                slowMo: 0,
                preferredRevision: '<SEE_PUPPETEER_SOURCE>',
                usePipe: false,
            });
            return this.connection;
        } catch (e) {
            console.log('error setting up connection', e);
            this.runner.kill();
            throw e;
        }
    }
}

module.exports = LocalChrome;

/**
* @typedef {import('./BaseBrowser').BrowserOptions} BrowserOptions
*/

/**
* @typedef {import('puppeteer-core/lib/cjs/puppeteer/common/Connection').Connection} BrowserConnection
*/
40 changes: 40 additions & 0 deletions browser/openBrowser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
const {VISUAL_DEBUG} = require('../constants');
const {getDefaultChromium} = require('../helpers/chromiumDownload');
const LocalChrome = require('./LocalChrome');

/**
 * Launches a local Chrome instance configured for crawling and returns it
 * once it has been started.
 *
 * @param {function(...any):void} log
 * @param {string} proxyHost proxy URL; when set, traffic is routed through it
 * @param {string} executablePath path to chromium executable to use
 * @returns {Promise<LocalChrome>}
 * @throws {Error} when proxyHost is set but is not a parsable URL
 */
async function openBrowser(log, proxyHost, executablePath) {
    const extraArgs = [
        // enable FLoC
        '--enable-blink-features=InterestCohortAPI',
        '--enable-features="FederatedLearningOfCohorts:update_interval/10s/minimum_history_domain_size_required/1,FlocIdSortingLshBasedComputation,InterestCohortFeaturePolicy"',
        '--js-flags="--async-stack-traces --stack-trace-limit 32"'
    ];
    if (proxyHost) {
        let url;
        try {
            url = new URL(proxyHost);
        } catch (e) {
            log('Invalid proxy URL');
            // fail fast: continuing would dereference an undefined `url` below
            // and crash with an unrelated TypeError
            throw new Error(`Invalid proxy URL: ${proxyHost}`);
        }

        extraArgs.push(`--proxy-server=${proxyHost}`);
        // the proxy's own hostname is excluded from the catch-all resolver rule
        extraArgs.push(`--host-resolver-rules="MAP * ~NOTFOUND , EXCLUDE ${url.hostname}"`);
    }

    const browser = new LocalChrome({
        extraArgs,
        headless: !VISUAL_DEBUG,
        // fall back to the default chromium only when no explicit path was given
        executablePath: executablePath || await getDefaultChromium(log),
    });
    await browser.start();

    return browser;
}

module.exports = openBrowser;
113 changes: 68 additions & 45 deletions cli/crawl-cli.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
/* eslint-disable max-lines */
const path = require('path');
const fs = require('fs');
const chalk = require('chalk').default;
const chalk = require('chalk');
const asyncLib = require('async');
const runCrawlers = require('../crawlerConductor');
const program = require('commander');
const URL = require('url').URL;
const {program} = require('commander');
const {getCollectorIds, createCollector} = require('../helpers/collectorsList');
const {getReporterIds, createReporter} = require('../helpers/reportersList');
const {metadataFileExists, createMetadataFile} = require('./metadataFile');
const crawlConfig = require('./crawlConfig');
const {createUniqueUrlName} = require('../helpers/hash');

// eslint-disable-next-line @typescript-eslint/no-unused-vars
const BaseCollector = require('../collectors/BaseCollector');
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const BaseReporter = require('../reporters/BaseReporter');

program
.option('-o, --output <path>', 'output folder')
.option('-u, --url <url>', 'single URL')
Expand All @@ -36,6 +31,57 @@ program
.option('--chromium-version <version_number>', 'use custom version of chromium')
.parse(process.argv);

/**
 * Builds the location of the result file for a given URL inside the output folder.
 *
 * @param {string} outputPath
 * @param {URL} url
 * @param {string} fileType file extension, defaults to 'json'
 */
function createOutputPath(outputPath, url, fileType = 'json') {
    const baseName = createUniqueUrlName(url);
    const fileName = `${baseName}.${fileType}`;

    return path.join(outputPath, fileName);
}

/**
 * Asynchronously drops entries that cannot be crawled: items whose URL does not
 * parse and, when an output path is given, items whose result file already exists.
 *
 * @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls
 * @param {function} logFunction
 * @param {string} outputPath
 */
function filterUrls(inputUrls, logFunction, outputPath) {
    /**
     * Decides whether a single input entry should be kept.
     *
     * @param {string|{url:string}} item
     * @param {function} filterCallback
     */
    const shouldCrawl = (item, filterCallback) => {
        const urlString = (typeof item === 'string') ? item : item.url;

        /**
         * @type {URL}
         */
        let url;

        try {
            url = new URL(urlString);
        } catch {
            logFunction(chalk.yellow('Invalid URL:'), urlString);
            filterCallback(null, false);
            return;
        }

        // no output path means nothing to compare against - keep every valid URL
        if (!outputPath) {
            filterCallback(null, true);
            return;
        }

        // filter out entries for which result file already exists
        const outputFile = createOutputPath(outputPath, url);
        fs.access(outputFile, err => {
            const alreadyCrawled = !err;

            if (alreadyCrawled) {
                logFunction(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
            }
            filterCallback(null, !alreadyCrawled);
        });
    };

    return asyncLib.filter(inputUrls, shouldCrawl).catch(err => {
        logFunction(chalk.red(`Could not filter URL list: ${err}`));
        throw err;
    });
}

/**
* @param {Array<string|{url:string, dataCollectors?:BaseCollector[]}>} inputUrls
* @param {string} outputPath
Expand Down Expand Up @@ -71,39 +117,8 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
});
};

/**
* @type {function(...any):string}
* @param {URL} url
* @param {string} fileType file extension, defaults to 'json'
*/
const createOutputPath = ((url, fileType = 'json') => path.join(outputPath, `${createUniqueUrlName(url)}.${fileType}`));

const urls = inputUrls.filter(item => {
const urlString = (typeof item === 'string') ? item : item.url;

/**
* @type {URL}
*/
let url;

try {
url = new URL(urlString);
} catch {
log(chalk.yellow('Invalid URL:'), urlString);
return false;
}

if (forceOverwrite !== true) {
// filter out entries for which result file already exists
const outputFile = createOutputPath(url);
if (fs.existsSync(outputFile)) {
log(chalk.yellow(`Skipping "${urlString}" because output file already exists.`));
return false;
}
}

return true;
});
const urls = await filterUrls(inputUrls, log, forceOverwrite === true ? null : outputPath);
log(chalk.yellow(`Skipped ${inputUrls.length - urls.length} URLs`));

const urlsLength = urls.length;
let failures = 0;
Expand Down Expand Up @@ -135,11 +150,11 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da

crawlTimes.push([data.testStarted, data.testFinished, data.testFinished - data.testStarted]);

const outputFile = createOutputPath(url);
const outputFile = createOutputPath(outputPath, url);

// move screenshot to its own file and only keep screenshot path in the JSON data
if (data.data.screenshots) {
const screenshotFilename = createOutputPath(url, 'jpg');
const screenshotFilename = createOutputPath(outputPath, url, 'jpg');
fs.writeFileSync(screenshotFilename, Buffer.from(data.data.screenshots, 'base64'));

data.data.screenshots = screenshotFilename;
Expand Down Expand Up @@ -203,9 +218,9 @@ async function run(inputUrls, outputPath, verbose, logPath, numberOfCrawlers, da
}

// @ts-ignore
const config = crawlConfig.figureOut(program);
const config = crawlConfig.figureOut(program.opts());
const collectorFlags = {
autoconsentAction: program.autoconsentAction,
autoconsentAction: program.opts().autoconsentAction,
};
/**
* @type {BaseCollector[]}
Expand Down Expand Up @@ -259,3 +274,11 @@ if (!config.urls || !config.output) {

run(urls, config.output, config.verbose, config.logPath, config.crawlers || null, dataCollectors, reporters, config.forceOverwrite, config.filterOutFirstParty, config.emulateMobile, config.proxyConfig, config.regionCode, !config.disableAntiBot, config.chromiumVersion, config.maxLoadTimeMs, config.extraExecutionTimeMs, collectorFlags);
}

/**
* @typedef {import('../collectors/BaseCollector')} BaseCollector
*/

/**
* @typedef {import('../reporters/BaseReporter')} BaseReporter
*/
18 changes: 9 additions & 9 deletions collectors/APICallCollector.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@ class APICallCollector extends BaseCollector {
}

/**
* @param {{cdpClient: import('puppeteer').CDPSession, url: string, type: import('./TargetCollector').TargetType}} targetInfo
* @param {import('./BaseCollector').TargetInfo} targetInfo
*/
async addTarget({cdpClient, url}) {
const trackerTracker = new TrackerTracker(cdpClient.send.bind(cdpClient));
async addTarget({session, url}) {
const trackerTracker = new TrackerTracker(session.send.bind(session));
trackerTracker.setMainURL(url.toString());

cdpClient.on('Debugger.scriptParsed', this.onScriptParsed.bind(this, trackerTracker));
cdpClient.on('Debugger.paused', this.onDebuggerPaused.bind(this, trackerTracker));
cdpClient.on('Runtime.executionContextCreated', this.onExecutionContextCreated.bind(this, trackerTracker, cdpClient));
cdpClient.on('Runtime.bindingCalled', this.onBindingCalled.bind(this, trackerTracker));
await cdpClient.send('Runtime.addBinding', {name: 'registerAPICall'});
session.on('Debugger.scriptParsed', this.onScriptParsed.bind(this, trackerTracker));
session.on('Debugger.paused', this.onDebuggerPaused.bind(this, trackerTracker));
session.on('Runtime.executionContextCreated', this.onExecutionContextCreated.bind(this, trackerTracker, session));
session.on('Runtime.bindingCalled', this.onBindingCalled.bind(this, trackerTracker));
await session.send('Runtime.addBinding', {name: 'registerAPICall'});

try {
await trackerTracker.init({log: this._log});
Expand All @@ -47,7 +47,7 @@ class APICallCollector extends BaseCollector {

/**
* @param {TrackerTracker} trackerTracker
* @param {import('puppeteer').CDPSession} cdpClient
* @param {import('puppeteer-core/lib/cjs/puppeteer/common/Connection').CDPSession} cdpClient
* @param {import('devtools-protocol/types/protocol').Protocol.Runtime.ExecutionContextCreatedEvent} params
*/
async onExecutionContextCreated(trackerTracker, cdpClient, params) {
Expand Down
Loading
Loading