Skip to content

Commit

Permalink
Rename collector() to collect(), add types, and convert example.js to…
Browse files Browse the repository at this point in the history
… ts (#57)

* Add types for collector arguments and convert example.js to ts
* Update README to reflect collector changes
* Rename collector to collect
* Pass device type instead of string
  • Loading branch information
simonft authored Jun 15, 2023
1 parent 31d3de7 commit 218daca
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 127 deletions.
37 changes: 2 additions & 35 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ If you are interested in running it locally you can clone this repository and fo

## Usage

`node example.js`.
`npm run example`.

Results are stored in `demo-dir` by default.

## Collector configuration

`collector` takes the following arguments:
`collect` takes the following arguments:

- `inUrl` **required**
- The URL you want to scrape
Expand Down Expand Up @@ -118,39 +118,6 @@ Results are stored in `demo-dir` by default
- _TIP:_ Firefox lets you import a HAR file and visualize it using the network tab in the developer tools.
- You can also view it [here](https://toolbox.googleapps.com/apps/har_analyzer/).

```
const { collector } = require("@themarkup/blacklight-collector");
const { join } = require("path");
(async () => {
const EMULATE_DEVICE = false;
// Save the results to a folder
let OUT_DIR = true;
// The URL to test
const URL = "jetblue.com";
const defaultConfig = {
inUrl: `http://${URL}`,
numPages: 2,
headless: false,
emulateDevice: EMULATE_DEVICE
};
const result = await collector(
OUT_DIR
? { ...defaultConfig, ...{ outDir: join(__dirname, "demo-dir") } }
: defaultConfig
);
if (OUT_DIR) {
console.log(
`For captured data please look in ${join(__dirname, "demo-dir")}`
);
}
})();
```

Blacklight would not be possible without the work of [OpenWPM](https://github.com/mozilla/OpenWPM)
and the EU-EDPS's [website evidence collector](https://github.com/EU-EDPS/website-evidence-collector)
9 changes: 3 additions & 6 deletions __tests__/collector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,7 @@ it("can get social links", async () => {
jest.setTimeout(15000);
it.skip("Considers first party domains to be those from the domain requested or the domain of the page loaded after redirects", async () => {
const URL = "https://nyt.com";
const response = await collector({
inUrl: URL,
const response = await collector(URL, {
numPages: 1,
defaultWaitUntil: "domcontentloaded",
});
Expand All @@ -122,8 +121,7 @@ it.skip("Considers first party domains to be those from the domain requested or

it.skip("If a user enters a url with a subdomain blacklight will only browse to other pages in that subdomain", async () => {
const URL = "https://jobs.theguardian.com";
const response = await collector({
inUrl: URL,
const response = await collector(URL, {
numPages: 1,
defaultWaitUntil: "domcontentloaded",
});
Expand All @@ -139,8 +137,7 @@ it.skip("If a user enters a url with a subdomain blacklight will only browse to

it.skip("only exception to the subdomain rule is www", async () => {
const URL = "https://www.themarkup.org";
const response = await collector({
inUrl: URL,
const response = await collector(URL, {
numPages: 1,
defaultWaitUntil: "domcontentloaded",
});
Expand Down
21 changes: 0 additions & 21 deletions example.js

This file was deleted.

27 changes: 27 additions & 0 deletions example.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

import { KnownDevices } from "puppeteer";
import { CollectorOptions, collect } from "./src";
import { join } from 'path';

/**
 * Example entry point (run with `npm run example`).
 *
 * Scans a single site with `collect`, writing the captured artifacts to
 * `demo-dir` next to this file, and reports whether the scan succeeded.
 * The process exit code is set to 1 when the scan fails or throws so that
 * callers (shells, CI) can detect the failure.
 */
(async () => {
  const URL = 'example.com';
  // Referenced only by the commented-out `emulateDevice` line below; kept so
  // that line can be enabled without further edits.
  const EMULATE_DEVICE = 'iPhone 13 Mini';

  const config: CollectorOptions = {
    numPages: 3,
    headless: false,
    //emulateDevice: KnownDevices[EMULATE_DEVICE],
    // Uncomment to run with desktop/laptop browser
    // emulateDevice: {
    //   viewport: {height: 1440, width: 800},
    //   userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    // },
    outDir: join(__dirname, 'demo-dir'),
  };

  console.log(`Beginning scan of ${URL}`);

  const result = await collect(`http://${URL}`, config);

  // `collect` resolves with status 'failed' (instead of throwing) when the
  // initial page load fails, so the status must be checked explicitly.
  if (result.status === 'failed') {
    console.error(`Scan failed: could not load http://${URL}`);
    process.exitCode = 1;
    return;
  }

  console.log(`Scan complete: ${config.outDir}`);
})().catch((error: unknown) => {
  // Without this handler a throw inside `collect` would surface only as an
  // unhandled promise rejection.
  console.error('Scan aborted with an unexpected error:', error);
  process.exitCode = 1;
});
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"build": "run-s clean && run-s build:main",
"build:main": "tsc -p tsconfig.json",
"build:watch": "tsc --watch",
"example": "ts-node example.ts",
"test": "jest",
"test-server": "node ./config/test-server.js",
"fix": "run-s fix:*",
Expand Down
124 changes: 62 additions & 62 deletions src/collector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { writeFileSync } from 'fs';
import sampleSize from 'lodash.samplesize';
import os from 'os';
import { join } from 'path';
import puppeteer, { Browser, Page, PuppeteerLifeCycleEvent, KnownDevices } from 'puppeteer';
import puppeteer, { Browser, Page, PuppeteerLifeCycleEvent, KnownDevices, PuppeteerLaunchOptions } from 'puppeteer';
import PuppeteerHar from 'puppeteer-har';
import { getDomain, getSubdomain, parse } from 'tldts';
import url from 'url';
Expand All @@ -17,23 +17,25 @@ import { autoScroll, fillForms } from './pptr-utils/interaction-utils';
import { setupSessionRecordingInspector } from './session-recording';
import { setUpThirdPartyTrackersInspector } from './third-party-trackers';
import { clearDir } from './utils';
export const collector = async ({
inUrl,
outDir = join(process.cwd(), 'bl-tmp'),
headless = true,
title = 'Blacklight Inspection',
emulateDevice = 'iPhone 13 Mini',
captureHar = true,
captureLinks = false,
enableAdBlock = false,
clearCache = true,
quiet = true,
defaultTimeout = 30000,
numPages = 3,
defaultWaitUntil = 'networkidle2',
saveBrowserProfile = false,
saveScreenshots = true,
blTests = [

/**
 * Options accepted by `collect`.
 *
 * The shape is derived from `DEFAULT_OPTIONS`, with every key made optional;
 * `collect` spreads the caller's options over `DEFAULT_OPTIONS`, so any key
 * that is left out takes its default value.
 */
export type CollectorOptions = Partial<typeof DEFAULT_OPTIONS>;

const DEFAULT_OPTIONS = {
outDir: join(process.cwd(), 'bl-tmp'),
title: 'Blacklight Inspection',
emulateDevice: KnownDevices['iPhone 13 Mini'],
captureHar: true,
captureLinks: false,
enableAdBlock: false,
clearCache: true,
quiet: true,
headless: true,
defaultTimeout: 30000,
numPages: 3,
defaultWaitUntil: 'networkidle2' as PuppeteerLifeCycleEvent,
saveBrowserProfile: false,
saveScreenshots: true,
blTests: [
'behaviour_event_listeners',
'canvas_fingerprinters',
'canvas_font_fingerprinters',
Expand All @@ -43,28 +45,31 @@ export const collector = async ({
'session_recorders',
'third_party_trackers'
],
puppeteerExecutablePath = null,
extraChromiumArgs = []
}) => {
clearDir(outDir);
puppeteerExecutablePath: null as string|null,
extraChromiumArgs: [] as string[],
extraPuppeteerOptions: {} as Partial<PuppeteerLaunchOptions>
}

export const collect = async (inUrl: string, args: CollectorOptions) => {
args = { ...DEFAULT_OPTIONS, ...args };
clearDir(args.outDir);
const FIRST_PARTY = parse(inUrl);
let REDIRECTED_FIRST_PARTY = parse(inUrl);
const logger = getLogger({ outDir, quiet });
const logger = getLogger({ outDir: args.outDir, quiet: args.quiet });

const output: any = {
title,
args: args.title,
uri_ins: inUrl,
uri_dest: null,
uri_redirects: null,
secure_connection: {},
host: url.parse(inUrl).hostname,
config: {
clearCache,
captureHar,
captureLinks,
enableAdBlock,
emulateDevice,
numPages
cleareCache: args.clearCache,
captureHar: args.captureHar,
captureLinks: args.captureLinks,
enableAdBlock: args.enableAdBlock,
numPages: args.numPages
},
browser: null,
script: {
Expand All @@ -78,9 +83,6 @@ export const collector = async ({
start_time: new Date(),
end_time: null
};
if (emulateDevice) {
output.deviceEmulated = KnownDevices[emulateDevice];
}

// Log network requests and page links
const hosts = {
Expand All @@ -100,17 +102,17 @@ export const collector = async ({
let har = {} as any;
let page_response = null;
let loadError = false;
const userDataDir = saveBrowserProfile ? join(outDir, 'browser-profile') : undefined;
const userDataDir = args.saveBrowserProfile ? join(args.outDir, 'browser-profile') : undefined;
let didBrowserDisconnect = false;

const options = {
...defaultPuppeteerBrowserOptions,
args: [...defaultPuppeteerBrowserOptions.args, ...extraChromiumArgs],
headless,
args: [...defaultPuppeteerBrowserOptions.args, ...args.extraChromiumArgs],
headless: args.headless,
userDataDir
};
if (puppeteerExecutablePath) {
options['executablePath'] = puppeteerExecutablePath;
if (args.puppeteerExecutablePath) {
options['executablePath'] = args.puppeteerExecutablePath;
}
browser = await puppeteer.launch(options);
browser.on('disconnected', () => {
Expand All @@ -134,10 +136,8 @@ export const collector = async ({
version: os.release()
}
};
if (emulateDevice) {
const deviceOptions = KnownDevices[emulateDevice];
page.emulate(deviceOptions);
}
page.emulate(args.emulateDevice);

// record all requested hosts
await page.on('request', request => {
const l = parse(request.url());
Expand All @@ -151,7 +151,7 @@ export const collector = async ({
}
});

if (clearCache) {
if (args.clearCache) {
await clearCookiesCache(page);
}

Expand All @@ -160,12 +160,12 @@ export const collector = async ({
await setupKeyLoggingInspector(page, logger.warn);
await setupHttpCookieCapture(page, logger.warn);
await setupSessionRecordingInspector(page, logger.warn);
await setUpThirdPartyTrackersInspector(page, logger.warn, enableAdBlock);
await setUpThirdPartyTrackersInspector(page, logger.warn, args.enableAdBlock);

if (captureHar) {
if (args.captureHar) {
har = new PuppeteerHar(page);
await har.start({
path: outDir ? join(outDir, 'requests.har') : undefined
path: args.outDir ? join(args.outDir, 'requests.har') : undefined
});
}
if (didBrowserDisconnect) {
Expand All @@ -176,10 +176,10 @@ export const collector = async ({
}
// Go to the url
page_response = await page.goto(inUrl, {
timeout: defaultTimeout,
waitUntil: defaultWaitUntil as PuppeteerLifeCycleEvent
timeout: args.defaultTimeout,
waitUntil: args.defaultWaitUntil as PuppeteerLifeCycleEvent
});
await savePageContent(pageIndex, outDir, page, saveScreenshots);
await savePageContent(pageIndex, args.outDir, page, args.saveScreenshots);
pageIndex++;

let duplicatedLinks = [];
Expand All @@ -194,8 +194,8 @@ export const collector = async ({
if (typeof userDataDir !== 'undefined') {
clearDir(userDataDir, false);
}
if (outDir.includes('bl-tmp')) {
clearDir(outDir, false);
if (args.outDir.includes('bl-tmp')) {
clearDir(args.outDir, false);
}
return { status: 'failed', page_response };
}
Expand Down Expand Up @@ -232,7 +232,7 @@ export const collector = async ({
} else {
subDomainLinks = outputLinks.first_party;
}
const browse_links = sampleSize(subDomainLinks, numPages);
const browse_links = sampleSize(subDomainLinks, args.numPages);
output.browsing_history = [output.uri_dest].concat(browse_links.map(l => l.href));

for (const link of output.browsing_history.slice(1)) {
Expand All @@ -244,19 +244,19 @@ export const collector = async ({
};
}
await page.goto(link, {
timeout: defaultTimeout,
timeout: args.defaultTimeout,
waitUntil: 'networkidle2'
});

await savePageContent(pageIndex, outDir, page, saveScreenshots);
await savePageContent(pageIndex, args.outDir, page, args.saveScreenshots);
await fillForms(page);
await page.waitForTimeout(800);
pageIndex++;
duplicatedLinks = duplicatedLinks.concat(await getLinks(page));
await autoScroll(page);
}
await captureBrowserCookies(page, outDir);
if (captureHar) {
await captureBrowserCookies(page, args.outDir);
if (args.captureHar) {
await har.stop();
}

Expand Down Expand Up @@ -291,7 +291,7 @@ export const collector = async ({
}
};

if (captureLinks) {
if (args.captureLinks) {
output.links = outputLinks;
output.social = getSocialLinks(links);
}
Expand Down Expand Up @@ -334,15 +334,15 @@ export const collector = async ({
});
// We only consider something to be a third party tracker if:
// The domain is different to that of the final url (after any redirection) of the page the user requested to load.
const reports = blTests.reduce((acc, cur) => {
acc[cur] = generateReport(cur, event_data, outDir, REDIRECTED_FIRST_PARTY.domain);
const reports = args.blTests.reduce((acc, cur) => {
acc[cur] = generateReport(cur, event_data, args.outDir, REDIRECTED_FIRST_PARTY.domain);
return acc;
}, {});

const json_dump = JSON.stringify({ ...output, reports }, null, 2);
writeFileSync(join(outDir, 'inspection.json'), json_dump);
if (outDir.includes('bl-tmp')) {
clearDir(outDir, false);
writeFileSync(join(args.outDir, 'inspection.json'), json_dump);
if (args.outDir.includes('bl-tmp')) {
clearDir(args.outDir, false);
}
return { status: 'success', ...output, reports };
};
Loading

0 comments on commit 218daca

Please sign in to comment.