diff --git a/README.md b/README.md index 9cd93fe..48ee982 100644 --- a/README.md +++ b/README.md @@ -20,13 +20,13 @@ If you are interested in running it locally you can clone this repository and fo ## Usage -`node example.js`. +`npm run example`. Results are stored in `demo-dir` by default ## Collector configuration -`collector` takes the following arguments: +`collect` takes the following arguments: - `inUrl` **required** - The URL you want to scrape @@ -118,39 +118,6 @@ Results are stored in `demo-dir` by default - _TIP:_ Firefox lets you import a HAR file and visualize it using the network tab in the developer tools. - You can also view it [here](https://toolbox.googleapps.com/apps/har_analyzer/). -``` -const { collector } = require("@themarkup/blacklight-collector"); -const { join } = require("path"); - -(async () => { - const EMULATE_DEVICE = false; - - // Save the results to a folder - let OUT_DIR = true; - - // The URL to test - const URL = "jetblue.com"; - - const defaultConfig = { - inUrl: `http://${URL}`, - numPages: 2, - headless: false, - emulateDevice: EMULATE_DEVICE - }; - - const result = await collector( - OUT_DIR - ? 
{ ...defaultConfig, ...{ outDir: join(__dirname, "demo-dir") } } - : defaultConfig - ); - if (OUT_DIR) { - console.log( - `For captured data please look in ${join(__dirname, "demo-dir")}` - ); - } -})(); - -``` Blacklight would not be possible without the work of [OpenWPM](https://github.com/mozilla/OpenWPM) and the EU-EDPS's [website evidence collector](https://github.com/EU-EDPS/website-evidence-collector) diff --git a/__tests__/collector.ts b/__tests__/collector.ts index 6e731a7..17beb20 100644 --- a/__tests__/collector.ts +++ b/__tests__/collector.ts @@ -106,8 +106,7 @@ it("can get social links", async () => { jest.setTimeout(15000); it.skip("Considers first party domains to be those from the domain requested or the domain of the page loaded after redirects", async () => { const URL = "https://nyt.com"; - const response = await collector({ - inUrl: URL, + const response = await collector(URL, { numPages: 1, defaultWaitUntil: "domcontentloaded", }); @@ -122,8 +121,7 @@ it.skip("Considers first party domains to be those from the domain requested or it.skip("If a user enters a url with a subdomain blacklight will only browse to other pages in that subdomain", async () => { const URL = "https://jobs.theguardian.com"; - const response = await collector({ - inUrl: URL, + const response = await collector(URL, { numPages: 1, defaultWaitUntil: "domcontentloaded", }); @@ -139,8 +137,7 @@ it.skip("If a user enters a url with a subdomain blacklight will only browse to it.skip("only exception to the subdomain rule is www", async () => { const URL = "https://www.themarkup.org"; - const response = await collector({ - inUrl: URL, + const response = await collector(URL, { numPages: 1, defaultWaitUntil: "domcontentloaded", }); diff --git a/example.js b/example.js deleted file mode 100644 index 1b6b081..0000000 --- a/example.js +++ /dev/null @@ -1,21 +0,0 @@ -const { collector } = require('./build'); -const { join } = require('path'); - -(async () => { - const URL = 
'example.com'; - const EMULATE_DEVICE = 'iPhone 13 Mini'; - - const config = { - inUrl: `http://${URL}`, - numPages: 3, - headless: true, - emulateDevice: EMULATE_DEVICE, - outDir: join(__dirname, 'demo-dir') - }; - - console.log(`Beginning scan of ${URL}`); - - await collector(config); - - console.log(`Scan complete: ${config.outDir}`); -})(); diff --git a/example.ts b/example.ts new file mode 100644 index 0000000..baf3d87 --- /dev/null +++ b/example.ts @@ -0,0 +1,27 @@ + +import { KnownDevices } from "puppeteer"; +import { CollectorOptions, collect } from "./src"; +import { join } from 'path'; + +(async () => { + const URL = 'example.com'; + const EMULATE_DEVICE = 'iPhone 13 Mini'; + + const config: CollectorOptions = { + numPages: 3, + headless: false, + //emulateDevice: KnownDevices[EMULATE_DEVICE], + // Uncomment to run with desktop/laptop browser + // emulateDevice: { + // viewport: {height: 1440, width: 800}, + // userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" + // }, + outDir: join(__dirname, 'demo-dir'), + }; + + console.log(`Beginning scan of ${URL}`); + + await collect(`http://${URL}`, config); + + console.log(`Scan complete: ${config.outDir}`); +})(); diff --git a/package.json b/package.json index 859a35b..f8fa533 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "build": "run-s clean && run-s build:main", "build:main": "tsc -p tsconfig.json", "build:watch": "tsc --watch", + "example": "ts-node example.ts", "test": "jest", "test-server": "node ./config/test-server.js", "fix": "run-s fix:*", diff --git a/src/collector.ts b/src/collector.ts index 7cb750b..2fd7cb5 100644 --- a/src/collector.ts +++ b/src/collector.ts @@ -2,7 +2,7 @@ import { writeFileSync } from 'fs'; import sampleSize from 'lodash.samplesize'; import os from 'os'; import { join } from 'path'; -import puppeteer, { Browser, Page, PuppeteerLifeCycleEvent, KnownDevices } from 'puppeteer'; +import 
puppeteer, { Browser, Page, PuppeteerLifeCycleEvent, KnownDevices, PuppeteerLaunchOptions } from 'puppeteer'; import PuppeteerHar from 'puppeteer-har'; import { getDomain, getSubdomain, parse } from 'tldts'; import url from 'url'; @@ -17,23 +17,25 @@ import { autoScroll, fillForms } from './pptr-utils/interaction-utils'; import { setupSessionRecordingInspector } from './session-recording'; import { setUpThirdPartyTrackersInspector } from './third-party-trackers'; import { clearDir } from './utils'; -export const collector = async ({ - inUrl, - outDir = join(process.cwd(), 'bl-tmp'), - headless = true, - title = 'Blacklight Inspection', - emulateDevice = 'iPhone 13 Mini', - captureHar = true, - captureLinks = false, - enableAdBlock = false, - clearCache = true, - quiet = true, - defaultTimeout = 30000, - numPages = 3, - defaultWaitUntil = 'networkidle2', - saveBrowserProfile = false, - saveScreenshots = true, - blTests = [ + +export type CollectorOptions = Partial<typeof DEFAULT_OPTIONS>; + +const DEFAULT_OPTIONS = { + outDir: join(process.cwd(), 'bl-tmp'), + title: 'Blacklight Inspection', + emulateDevice: KnownDevices['iPhone 13 Mini'], + captureHar: true, + captureLinks: false, + enableAdBlock: false, + clearCache: true, + quiet: true, + headless: true, + defaultTimeout: 30000, + numPages: 3, + defaultWaitUntil: 'networkidle2' as PuppeteerLifeCycleEvent, + saveBrowserProfile: false, + saveScreenshots: true, + blTests: [ 'behaviour_event_listeners', 'canvas_fingerprinters', 'canvas_font_fingerprinters', @@ -43,28 +45,31 @@ export const collector = async ({ 'session_recorders', 'third_party_trackers' ], - puppeteerExecutablePath = null, - extraChromiumArgs = [] -}) => { - clearDir(outDir); + puppeteerExecutablePath: null as string|null, + extraChromiumArgs: [] as string[], + extraPuppeteerOptions: {} as Partial<PuppeteerLaunchOptions> +} + +export const collect = async (inUrl: string, args: CollectorOptions) => { + args = { ...DEFAULT_OPTIONS, ...args }; + clearDir(args.outDir); const FIRST_PARTY = parse(inUrl); 
let REDIRECTED_FIRST_PARTY = parse(inUrl); - const logger = getLogger({ outDir, quiet }); + const logger = getLogger({ outDir: args.outDir, quiet: args.quiet }); const output: any = { - title, + title: args.title, uri_ins: inUrl, uri_dest: null, uri_redirects: null, secure_connection: {}, host: url.parse(inUrl).hostname, config: { - clearCache, - captureHar, - captureLinks, - enableAdBlock, - emulateDevice, - numPages + clearCache: args.clearCache, + captureHar: args.captureHar, + captureLinks: args.captureLinks, + enableAdBlock: args.enableAdBlock, + numPages: args.numPages }, browser: null, script: { @@ -78,9 +83,6 @@ export const collector = async ({ start_time: new Date(), end_time: null }; - if (emulateDevice) { - output.deviceEmulated = KnownDevices[emulateDevice]; - } // Log network requests and page links const hosts = { @@ -100,17 +102,17 @@ export const collector = async ({ let har = {} as any; let page_response = null; let loadError = false; - const userDataDir = saveBrowserProfile ? join(outDir, 'browser-profile') : undefined; + const userDataDir = args.saveBrowserProfile ? 
join(args.outDir, 'browser-profile') : undefined; let didBrowserDisconnect = false; const options = { ...defaultPuppeteerBrowserOptions, - args: [...defaultPuppeteerBrowserOptions.args, ...extraChromiumArgs], - headless, + args: [...defaultPuppeteerBrowserOptions.args, ...args.extraChromiumArgs], + headless: args.headless, userDataDir }; - if (puppeteerExecutablePath) { - options['executablePath'] = puppeteerExecutablePath; + if (args.puppeteerExecutablePath) { + options['executablePath'] = args.puppeteerExecutablePath; } browser = await puppeteer.launch(options); browser.on('disconnected', () => { @@ -134,10 +136,8 @@ export const collector = async ({ version: os.release() } }; - if (emulateDevice) { - const deviceOptions = KnownDevices[emulateDevice]; - page.emulate(deviceOptions); - } + page.emulate(args.emulateDevice); + // record all requested hosts await page.on('request', request => { const l = parse(request.url()); @@ -151,7 +151,7 @@ export const collector = async ({ } }); - if (clearCache) { + if (args.clearCache) { await clearCookiesCache(page); } @@ -160,12 +160,12 @@ export const collector = async ({ await setupKeyLoggingInspector(page, logger.warn); await setupHttpCookieCapture(page, logger.warn); await setupSessionRecordingInspector(page, logger.warn); - await setUpThirdPartyTrackersInspector(page, logger.warn, enableAdBlock); + await setUpThirdPartyTrackersInspector(page, logger.warn, args.enableAdBlock); - if (captureHar) { + if (args.captureHar) { har = new PuppeteerHar(page); await har.start({ - path: outDir ? join(outDir, 'requests.har') : undefined + path: args.outDir ? 
join(args.outDir, 'requests.har') : undefined }); } if (didBrowserDisconnect) { @@ -176,10 +176,10 @@ export const collector = async ({ } // Go to the url page_response = await page.goto(inUrl, { - timeout: defaultTimeout, - waitUntil: defaultWaitUntil as PuppeteerLifeCycleEvent + timeout: args.defaultTimeout, + waitUntil: args.defaultWaitUntil as PuppeteerLifeCycleEvent }); - await savePageContent(pageIndex, outDir, page, saveScreenshots); + await savePageContent(pageIndex, args.outDir, page, args.saveScreenshots); pageIndex++; let duplicatedLinks = []; @@ -194,8 +194,8 @@ export const collector = async ({ if (typeof userDataDir !== 'undefined') { clearDir(userDataDir, false); } - if (outDir.includes('bl-tmp')) { - clearDir(outDir, false); + if (args.outDir.includes('bl-tmp')) { + clearDir(args.outDir, false); } return { status: 'failed', page_response }; } @@ -232,7 +232,7 @@ export const collector = async ({ } else { subDomainLinks = outputLinks.first_party; } - const browse_links = sampleSize(subDomainLinks, numPages); + const browse_links = sampleSize(subDomainLinks, args.numPages); output.browsing_history = [output.uri_dest].concat(browse_links.map(l => l.href)); for (const link of output.browsing_history.slice(1)) { @@ -244,19 +244,19 @@ export const collector = async ({ }; } await page.goto(link, { - timeout: defaultTimeout, + timeout: args.defaultTimeout, waitUntil: 'networkidle2' }); - await savePageContent(pageIndex, outDir, page, saveScreenshots); + await savePageContent(pageIndex, args.outDir, page, args.saveScreenshots); await fillForms(page); await page.waitForTimeout(800); pageIndex++; duplicatedLinks = duplicatedLinks.concat(await getLinks(page)); await autoScroll(page); } - await captureBrowserCookies(page, outDir); - if (captureHar) { + await captureBrowserCookies(page, args.outDir); + if (args.captureHar) { await har.stop(); } @@ -291,7 +291,7 @@ export const collector = async ({ } }; - if (captureLinks) { + if (args.captureLinks) { output.links 
= outputLinks; output.social = getSocialLinks(links); } @@ -334,15 +334,15 @@ export const collector = async ({ }); // We only consider something to be a third party tracker if: // The domain is different to that of the final url (after any redirection) of the page the user requested to load. - const reports = blTests.reduce((acc, cur) => { - acc[cur] = generateReport(cur, event_data, outDir, REDIRECTED_FIRST_PARTY.domain); + const reports = args.blTests.reduce((acc, cur) => { + acc[cur] = generateReport(cur, event_data, args.outDir, REDIRECTED_FIRST_PARTY.domain); return acc; }, {}); const json_dump = JSON.stringify({ ...output, reports }, null, 2); - writeFileSync(join(outDir, 'inspection.json'), json_dump); - if (outDir.includes('bl-tmp')) { - clearDir(outDir, false); + writeFileSync(join(args.outDir, 'inspection.json'), json_dump); + if (args.outDir.includes('bl-tmp')) { + clearDir(args.outDir, false); } return { status: 'success', ...output, reports }; }; diff --git a/src/index.ts b/src/index.ts index 052447e..6e81989 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,3 +1 @@ -import { collector } from './collector'; - -export { collector }; +export { collect, CollectorOptions } from './collector';