Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cookie popup blocking via adblock-rs #187

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -3,9 +3,15 @@ ARG BROWSER_VERSION=105

FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION}

# TODO: Move this into base image
RUN apt-get update && apt-get install -y jq

ENV RUSTUP_HOME=/rust
ENV CARGO_HOME=/cargo
ENV PATH=/cargo/bin:/rust/bin:$PATH

RUN echo "(curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly --no-modify-path)" > /install-rust.sh && chmod 755 /install-rust.sh
RUN /install-rust.sh

# needed to add args to main build stage
ARG BROWSER_VERSION

@@ -36,6 +42,9 @@ RUN mkdir -p /tmp/ads && cd /tmp/ads && \
cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \
rm /tmp/ads/ad-hosts.txt

# Add cookie popup blocklist
# -f fails the build on an HTTP error instead of silently saving the error page
# as the filter list; -sS stays quiet but still prints errors; -L follows redirects.
RUN curl -fsSL -o /app/easylist-cookies.txt https://secure.fanboy.co.nz/fanboy-cookiemonster.txt

RUN yarn install

ADD *.js /app/
28 changes: 15 additions & 13 deletions crawler.js
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@ import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI }

import { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";

import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { BlockRules } from "./util/blockrules.js";

// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
@@ -98,7 +98,6 @@ export class Crawler {
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

this.blockRules = null;
this.adBlockRules = null;

this.errorCount = 0;

@@ -501,6 +500,10 @@ export class Crawler {
}
}

blockEnabled() {
return (this.params.blockRules && this.params.blockRules.length) || this.params.blockAds || this.params.blockCookiePopups;
}

async serializeAndExit() {
await this.serializeConfig();
process.exit(0);
@@ -577,12 +580,15 @@ export class Crawler {

await this.initPages();

if (this.params.blockAds) {
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text));
}

if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
if (this.blockEnabled()) {
this.blockRules = new BlockRules(
this.params.blockRules,
this.captureBasePrefix,
this.params.blockMessage,
this.params.blockAds,
this.params.adBlockMessage,
this.params.blockCookiePopups,
(text) => this.debugLog(text));
}

this.screencaster = this.initScreenCaster();
@@ -760,11 +766,7 @@ export class Crawler {
}
}

if (this.adBlockRules) {
await this.adBlockRules.initPage(page);
}

if (this.blockRules) {
if (this.blockEnabled()) {
await this.blockRules.initPage(page);
}

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -12,11 +12,13 @@
},
"dependencies": {
"abort-controller": "^3.0.0",
"adblock-rs": "^0.5.8",
"browsertrix-behaviors": "^0.3.4",
"get-folder-size": "^4.0.0",
"ioredis": "^4.27.1",
"js-yaml": "^4.1.0",
"minio": "7.0.26",
"neon-cli": "^0.10.1",
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
"puppeteer-core": "^17.1.2",
"request": "^2.88.2",
7 changes: 7 additions & 0 deletions util/argParser.js
Original file line number Diff line number Diff line change
@@ -118,6 +118,13 @@ class ArgParser {
default: false,
},

"blockCookiePopups": {
alias: "blockcookiepopups",
describe: "If set, block cookie, GDPR, and privacy notice pop-ups (based on EasyList Cookie List)",
type: "boolean",
default: false,
},

"adBlockMessage": {
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
type: "string",
112 changes: 58 additions & 54 deletions util/blockrules.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import fs from "fs";
import * as AdBlockClient from "adblock-rs";

const RULE_TYPES = ["block", "allowOnly"];

@@ -9,7 +10,8 @@ const BlockState = {
BLOCK_PAGE_NAV: "page",
BLOCK_IFRAME_NAV: "iframe",
BLOCK_OTHER: "resource",
BLOCK_AD: "advertisement"
BLOCK_AD: "advertisement",
BLOCK_COOKIE_POPUP: "cookie pop-up"
};


@@ -47,12 +49,23 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// ===========================================================================
export class BlockRules
{
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
constructor(blockRules, blockPutUrl, blockErrMsg, blockAds, adBlockErrMsg, blockCookiePopups, debugLog) {
this.rules = [];
this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg;
this.blockAds = blockAds;
this.adBlockErrMsg = adBlockErrMsg;
this.blockCookiePopups = blockCookiePopups;
this.debugLog = debugLog;

this.adhosts = JSON.parse(fs.readFileSync(new URL("../ad-hosts.json", import.meta.url)));
if (this.blockCookiePopups) {
const easylistRules = fs.readFileSync(new URL("../easylist-cookies.txt", import.meta.url), { encoding: "utf-8" }).split("\n");
const filterSet = new AdBlockClient.FilterSet(true);
filterSet.addFilters(easylistRules);
this.cookieBlockClient = new AdBlockClient.Engine(filterSet, true);
}

this.blockedUrlSet = new Set();

for (const ruleData of blockRules) {
@@ -68,10 +81,6 @@ export class BlockRules
}

async initPage(page) {
if (!this.rules.length) {
return;
}

if (page._btrix_interceptionAdded) {
return true;
}
@@ -95,14 +104,25 @@ export class BlockRules
let blockState;

try {
blockState = await this.shouldBlock(request, url);

if (blockState === BlockState.ALLOW) {
await request.continue();
} else {
await request.abort("blockedbyclient");
if (this.blockAds) {
blockState = await this.shouldBlockAd(request, url);
if (blockState === BlockState.BLOCK_AD) {
return await request.abort("blockedbyclient");
}
}

if (this.blockCookiePopups) {
blockState = await this.shouldBlockCookiePopup(request, url);
if (blockState === BlockState.BLOCK_COOKIE_POPUP) {
return await request.abort("blockedbyclient");
}
}
if (this.rules.length) {
blockState = await this.shouldBlock(request, url);
if (blockState !== BlockState.ALLOW) {
return await request.abort("blockedbyclient");
}
}
await request.continue();
} catch (e) {
this.debugLog(`Block: (${blockState}) Failed On: ${url} Reason: ${e}`);
}
@@ -208,7 +228,27 @@ export class BlockRules
}
}

async recordBlockMsg(url) {
async shouldBlockAd(request, url) {
const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) {
this.debugLog(`URL blocked for being an ad: ${url}`);
await this.recordBlockMsg(url, true);
return BlockState.BLOCK_AD;
}
return BlockState.ALLOW;
}

async shouldBlockCookiePopup(request, url) {
const checkResult = this.cookieBlockClient.check(url, "", "");
if (checkResult != false) {
this.debugLog(`URL blocked for being a cookie pop-up: ${url}`);
return BlockState.BLOCK_COOKIE_POPUP;
}
return BlockState.ALLOW;
}

async recordBlockMsg(url, ad=false) {
if (this.blockedUrlSet.has(url)) {
return;
}
@@ -219,48 +259,12 @@ export class BlockRules
return;
}

const body = this.blockErrMsg;
// Ad blocks record their own message. The constructor stores it as
// `adBlockErrMsg`; reading `adBlockErrMessage` always produced undefined.
const body = ad ? this.adBlockErrMsg : this.blockErrMsg;
const putUrl = new URL(this.blockPutUrl);
putUrl.searchParams.set("url", url);
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
}
}


// ===========================================================================
export class AdBlockRules extends BlockRules
{
constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") {
super([], blockPutUrl, blockErrMsg, debugLog);
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
}

async initPage(page) {
if (page._btrix_adInterceptionAdded) {
return true;
}

page._btrix_adInterceptionAdded = true;

await page.setRequestInterception(true);

page.on("request", async (request) => {
try {
await this.handleRequest(request);
} catch (e) {
console.warn(e);
}
});
}

async shouldBlock(request, url) {
const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) {
this.debugLog(`URL blocked for being an ad: ${url}`);
await this.recordBlockMsg(url);
return BlockState.BLOCK_AD;
}
return BlockState.ALLOW;
}
}
357 changes: 350 additions & 7 deletions yarn.lock

Large diffs are not rendered by default.