diff --git a/collectors/CMPCollector.js b/collectors/CMPCollector.js index 611482c8..7bff83f1 100644 --- a/collectors/CMPCollector.js +++ b/collectors/CMPCollector.js @@ -1,6 +1,5 @@ /* eslint-disable max-lines */ const fs = require('fs'); -const createDeferred = require('../helpers/deferred'); const waitFor = require('../helpers/waitFor'); const BaseCollector = require('./BaseCollector'); @@ -15,12 +14,12 @@ const BaseCollector = require('./BaseCollector'); * @typedef { import('@duckduckgo/autoconsent/lib/messages').OptOutResultMessage } OptOutResultMessage * @typedef { import('@duckduckgo/autoconsent/lib/messages').OptInResultMessage } OptInResultMessage * @typedef { import('@duckduckgo/autoconsent/lib/messages').DoneMessage } DoneMessage - * @typedef { { snippets: string[], patterns: string[] } } ScanResult + * @typedef { { snippets: Set, patterns: Set, filterListMatched: boolean } } ScanResult */ // @ts-ignore const baseContentScript = fs.readFileSync( - require.resolve('@duckduckgo/autoconsent/dist/autoconsent.playwright.js'), + require.resolve('../node_modules/@duckduckgo/autoconsent/dist/autoconsent.playwright.js'), 'utf8' ); @@ -46,32 +45,6 @@ function isIgnoredEvalError(e) { ); } -// TODO: check for false positive detections per pattern -const DETECT_PATTERNS = [ - /accept cookies/ig, - /accept all/ig, - /reject all/ig, - /only necessary cookies/ig, // "only necessary" is probably too broad - /by clicking.*(accept|agree|allow)/ig, - /by continuing/ig, - /we (use|serve)( optional)? cookies/ig, - /we are using cookies/ig, - /use of cookies/ig, - /(this|our) (web)?site.*cookies/ig, - /cookies (and|or) .* technologies/ig, - /such as cookies/ig, - /read more about.*cookies/ig, - /consent to.*cookies/ig, - /we and our partners.*cookies/ig, - /we.*store.*information.*such as.*cookies/ig, - /store and\/or access information.*on a device/ig, - /personalised ads and content, ad and content measurement/ig, - - // it might be tempting to add the patterns below, but they cause too many false positives. Don't do it :) - // /cookies? settings/i, - // /cookies? preferences/i, -]; - class CMPCollector extends BaseCollector { id() { return 'cmps'; @@ -88,12 +61,12 @@ class CMPCollector extends BaseCollector { this.receivedMsgs = []; this.selfTestFrame = null; this.isolated2pageworld = new Map(); - this.pendingScan = createDeferred(); this.context = options.context; /** @type {ScanResult} */ this.scanResult = { - snippets: [], - patterns: [], + snippets: new Set([]), + patterns: new Set([]), + filterListMatched: false, }; } @@ -189,10 +162,12 @@ class CMPCollector extends BaseCollector { /** @type {Partial} */ const autoconsentConfig = { enabled: true, - autoAction: null, // we request action explicitly later + autoAction: 'optOut', disabledCmps: [], enablePrehide: false, enableCosmeticRules: true, + enableFilterList: true, + enableHeuristicDetection: true, detectRetries: 20, isMainWorld: false }; @@ -203,14 +178,14 @@ class CMPCollector extends BaseCollector { break; } case 'popupFound': - if (this.autoAction) { - await this.pendingScan.promise; // wait for the pattern detection first - await this._cdpClient.send('Runtime.evaluate', { - expression: `autoconsentReceiveMessage({ type: "${this.autoAction}" })`, - contextId: executionContextId, - }); + if (msg.cmp === 'filterList') { + this.scanResult.filterListMatched = true; } break; + case 'report': + msg.state.heuristicPatterns.forEach(x => this.scanResult.patterns.add(x)); + msg.state.heuristicSnippets.forEach(x => this.scanResult.snippets.add(x)); + break; case 'optInResult': case 'optOutResult': { if (msg.scheduleSelfTest) { @@ -315,44 +290,6 @@ class CMPCollector extends BaseCollector { } } - async postLoad() { - /** - * @type {string[]} - */ - const foundPatterns = []; - const foundSnippets = []; - const pages = await this.context.pages(); - if (pages.length > 0) { - const page = pages[0]; - /** - * @type {Promise[]} - */ - const promises = []; - page.frames().forEach(frame => { - // eslint-disable-next-line no-undef - promises.push(frame.evaluate(() => document.documentElement.innerText).catch(reason => { - this.log(`error retrieving text: ${reason}`); - // ignore exceptions - return ''; - })); - }); - const texts = await Promise.all(promises); - const allTexts = texts.join('\n'); - for (const p of DETECT_PATTERNS) { - const matches = allTexts.match(p); - if (matches) { - foundPatterns.push(p.toString()); - foundSnippets.push(...matches.map(m => m.substring(0, 200))); - } - } - } - this.pendingScan.resolve(); - this.scanResult = { - patterns: foundPatterns, - snippets: Array.from(new Set(foundSnippets)), - }; - } - /** * @returns {CMPResult[]} */ @@ -394,8 +331,9 @@ class CMPCollector extends BaseCollector { succeeded: false, selfTestFail: Boolean(selfTestResult && !selfTestResult.result), errors, - patterns: [], - snippets: [], + patterns: Array.from(this.scanResult.patterns), + snippets: Array.from(this.scanResult.snippets), + filterListMatched: this.scanResult.filterListMatched, }; const found = this.findMessage({type: 'popupFound', cmp: msg.cmp}); @@ -427,25 +365,19 @@ class CMPCollector extends BaseCollector { async getData() { await this.waitForFinish(); const results = this.collectResults(); - if (this.scanResult.patterns.length > 0) { - if (results.length > 0) { - results.forEach(r => { - r.patterns = this.scanResult.patterns; - r.snippets = this.scanResult.snippets; - }); - } else { - results.push({ - final: false, - name: '', - open: false, - started: false, - succeeded: false, - selfTestFail: false, - errors: [], - patterns: this.scanResult.patterns, - snippets: this.scanResult.snippets, - }); - } + if (this.scanResult.patterns.size > 0 && results.length === 0) { + results.push({ + final: false, + name: '', + open: false, + started: false, + succeeded: false, + selfTestFail: false, + errors: [], + patterns: Array.from(this.scanResult.patterns), + snippets: Array.from(this.scanResult.snippets), + filterListMatched: this.scanResult.filterListMatched, + }); } return results; } @@ -462,6 +394,7 @@ class CMPCollector extends BaseCollector { * @property {string[]} errors * @property {string[]} patterns * @property {string[]} snippets + * @property {boolean} filterListMatched */ module.exports = CMPCollector; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 7f068427..b5c065fe 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "1.0.0", "license": "Apache 2.0", "dependencies": { - "@duckduckgo/autoconsent": "^10.15.0", + "@duckduckgo/autoconsent": "^12.0.0", "async": "^2.6.1", "chalk": "^2.4.1", "clickhouse": "^2.6.0", @@ -40,11 +40,14 @@ } }, "node_modules/@duckduckgo/autoconsent": { - "version": "10.15.0", - "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-10.15.0.tgz", - "integrity": "sha512-Jxaogy2IuZEEV1+xPyo3c3PnZJmBO6ima/MapF2VolI/IKxXnL+9yYqyydPhSk0ahx42YINA6uIK6zexlKDIkQ==", + "version": "12.0.0", + "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.0.0.tgz", + "integrity": "sha512-ObPv0pE1d8G1Nnj9NtTxvu04mNRnJ8o8cpU1G0YJk1kP2ykipGnlp9wrn9Tkn2fDy5zT8HQqDsOYWEyk3ES1kg==", + "license": "MPL-2.0", "dependencies": { - "tldts-experimental": "^6.1.37" + "@ghostery/adblocker": "^2.0.4", + "@ghostery/adblocker-content": "^2.0.4", + "tldts-experimental": "^6.1.41" } }, "node_modules/@eslint-community/eslint-utils": { @@ -115,6 +118,35 @@ "node": "^12.22.0 || ^14.17.0 || >=16.0.0" } }, + "node_modules/@ghostery/adblocker": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker/-/adblocker-2.1.1.tgz", + "integrity": "sha512-FL4yWrpNTCmtbAfeLotUoo94ZyNqHdZpZRo4Qlk0guPzDGcOtW4/c84UzS9D/Z9Z4H3nWSCrW0q38pjwAbDykA==", + "license": "MPL-2.0", + "dependencies": { + "@ghostery/adblocker-content": "^2.1.1", + "@ghostery/adblocker-extended-selectors": "^2.1.1", + "@remusao/guess-url-type": "^1.3.0", + "@remusao/small": "^1.2.1", + "@remusao/smaz": "^1.9.1", + "tldts-experimental": "^6.0.14" + } + }, + "node_modules/@ghostery/adblocker-content": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-content/-/adblocker-content-2.1.1.tgz", + "integrity": "sha512-1DKHmPnlQleXapaL36xZOwwZmpdbjMP/IcWdTTzyriyCDIFlSwBDT1DJ3xg0TK61ahZMEwz1MnTGM6X99z/5rQ==", + "license": "MPL-2.0", + "dependencies": { + "@ghostery/adblocker-extended-selectors": "^2.1.1" + } + }, + "node_modules/@ghostery/adblocker-extended-selectors": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-extended-selectors/-/adblocker-extended-selectors-2.1.1.tgz", + "integrity": "sha512-jEHjU2CarS2MtRYfm/6iTKMS1DVzepuwXSMKg1zTyHl+u4ZKvKNYFK7plD0nUlL5a8akyRkYwLheXnKsW3nChQ==", + "license": "MPL-2.0" + }, "node_modules/@humanwhocodes/config-array": { "version": "0.11.14", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz", @@ -185,6 +217,49 @@ "node": ">= 8" } }, + "node_modules/@remusao/guess-url-type": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.3.0.tgz", + "integrity": "sha512-SNSJGxH5ckvxb3EUHj4DqlAm/bxNxNv2kx/AESZva/9VfcBokwKNS+C4D1lQdWIDM1R3d3UG+xmVzlkNG8CPTQ==", + "license": "MPL-2.0" + }, + "node_modules/@remusao/small": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.3.0.tgz", + "integrity": "sha512-bydAhJI+ywmg5xMUcbqoR8KahetcfkFywEZpsyFZ8EBofilvWxbXnMSe4vnjDI1Y+SWxnNhR4AL/2BAXkf4b8A==", + "license": "MPL-2.0" + }, + "node_modules/@remusao/smaz": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.10.0.tgz", + "integrity": "sha512-GQzCxmmMpLkyZwcwNgz8TpuBEWl0RUQa8IcvKiYlPxuyYKqyqPkCr0hlHI15ckn3kDUPS68VmTVgyPnLNrdVmg==", + "license": "MPL-2.0", + "dependencies": { + "@remusao/smaz-compress": "^1.10.0", + "@remusao/smaz-decompress": "^1.10.0" + } + }, + "node_modules/@remusao/smaz-compress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.10.0.tgz", + "integrity": "sha512-E/lC8OSU+3bQrUl64vlLyPzIxo7dxF2RvNBe9KzcM4ax43J/d+YMinmMztHyCIHqRbz7rBCtkp3c0KfeIbHmEg==", + "license": "MPL-2.0", + "dependencies": { + "@remusao/trie": "^1.5.0" + } + }, + "node_modules/@remusao/smaz-decompress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.10.0.tgz", + "integrity": "sha512-aA5ImUH480Pcs5/cOgToKmFnzi7osSNG6ft+7DdmQTaQEEst3nLq3JLlBEk+gwidURymjbx6DYs60LHaZ415VQ==", + "license": "MPL-2.0" + }, + "node_modules/@remusao/trie": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.5.0.tgz", + "integrity": "sha512-UX+3utJKgwCsg6sUozjxd38gNMVRXrY4TNX9VvCdSrlZBS1nZjRPi98ON3QjRAdf6KCguJFyQARRsulTeqQiPg==", + "license": "MPL-2.0" + }, "node_modules/@types/async": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/@types/async/-/async-2.4.2.tgz", @@ -3367,11 +3442,13 @@ }, "dependencies": { "@duckduckgo/autoconsent": { - "version": "10.15.0", - "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-10.15.0.tgz", - "integrity": "sha512-Jxaogy2IuZEEV1+xPyo3c3PnZJmBO6ima/MapF2VolI/IKxXnL+9yYqyydPhSk0ahx42YINA6uIK6zexlKDIkQ==", + "version": "12.0.0", + "resolved": "https://registry.npmjs.org/@duckduckgo/autoconsent/-/autoconsent-12.0.0.tgz", + "integrity": "sha512-ObPv0pE1d8G1Nnj9NtTxvu04mNRnJ8o8cpU1G0YJk1kP2ykipGnlp9wrn9Tkn2fDy5zT8HQqDsOYWEyk3ES1kg==", "requires": { - "tldts-experimental": "^6.1.37" + "@ghostery/adblocker": "^2.0.4", + "@ghostery/adblocker-content": "^2.0.4", + "tldts-experimental": "^6.1.41" } }, "@eslint-community/eslint-utils": { @@ -3420,6 +3497,32 @@ "integrity": "sha512-Ys+3g2TaW7gADOJzPt83SJtCDhMjndcDMFVQ/Tj9iA1BfJzFKD9mAUXT3OenpuPHbI6P/myECxRJrofUsDx/5g==", "dev": true }, + "@ghostery/adblocker": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker/-/adblocker-2.1.1.tgz", + "integrity": "sha512-FL4yWrpNTCmtbAfeLotUoo94ZyNqHdZpZRo4Qlk0guPzDGcOtW4/c84UzS9D/Z9Z4H3nWSCrW0q38pjwAbDykA==", + "requires": { + "@ghostery/adblocker-content": "^2.1.1", + "@ghostery/adblocker-extended-selectors": "^2.1.1", + "@remusao/guess-url-type": "^1.3.0", + "@remusao/small": "^1.2.1", + "@remusao/smaz": "^1.9.1", + "tldts-experimental": "^6.0.14" + } + }, + "@ghostery/adblocker-content": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-content/-/adblocker-content-2.1.1.tgz", + "integrity": "sha512-1DKHmPnlQleXapaL36xZOwwZmpdbjMP/IcWdTTzyriyCDIFlSwBDT1DJ3xg0TK61ahZMEwz1MnTGM6X99z/5rQ==", + "requires": { + "@ghostery/adblocker-extended-selectors": "^2.1.1" + } + }, + "@ghostery/adblocker-extended-selectors": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/@ghostery/adblocker-extended-selectors/-/adblocker-extended-selectors-2.1.1.tgz", + "integrity": "sha512-jEHjU2CarS2MtRYfm/6iTKMS1DVzepuwXSMKg1zTyHl+u4ZKvKNYFK7plD0nUlL5a8akyRkYwLheXnKsW3nChQ==" + }, "@humanwhocodes/config-array": { "version": "0.11.14", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz", @@ -3469,6 +3572,43 @@ "fastq": "^1.6.0" } }, + "@remusao/guess-url-type": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.3.0.tgz", + "integrity": "sha512-SNSJGxH5ckvxb3EUHj4DqlAm/bxNxNv2kx/AESZva/9VfcBokwKNS+C4D1lQdWIDM1R3d3UG+xmVzlkNG8CPTQ==" + }, + "@remusao/small": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.3.0.tgz", + "integrity": "sha512-bydAhJI+ywmg5xMUcbqoR8KahetcfkFywEZpsyFZ8EBofilvWxbXnMSe4vnjDI1Y+SWxnNhR4AL/2BAXkf4b8A==" + }, + "@remusao/smaz": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.10.0.tgz", + "integrity": "sha512-GQzCxmmMpLkyZwcwNgz8TpuBEWl0RUQa8IcvKiYlPxuyYKqyqPkCr0hlHI15ckn3kDUPS68VmTVgyPnLNrdVmg==", + "requires": { + "@remusao/smaz-compress": "^1.10.0", + "@remusao/smaz-decompress": "^1.10.0" + } + }, + "@remusao/smaz-compress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.10.0.tgz", + "integrity": "sha512-E/lC8OSU+3bQrUl64vlLyPzIxo7dxF2RvNBe9KzcM4ax43J/d+YMinmMztHyCIHqRbz7rBCtkp3c0KfeIbHmEg==", + "requires": { + "@remusao/trie": "^1.5.0" + } + }, + "@remusao/smaz-decompress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.10.0.tgz", + "integrity": "sha512-aA5ImUH480Pcs5/cOgToKmFnzi7osSNG6ft+7DdmQTaQEEst3nLq3JLlBEk+gwidURymjbx6DYs60LHaZ415VQ==" + }, + "@remusao/trie": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.5.0.tgz", + "integrity": "sha512-UX+3utJKgwCsg6sUozjxd38gNMVRXrY4TNX9VvCdSrlZBS1nZjRPi98ON3QjRAdf6KCguJFyQARRsulTeqQiPg==" + }, "@types/async": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/@types/async/-/async-2.4.2.tgz", diff --git a/package.json b/package.json index 94064e82..be7afaab 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "typescript": "^4.6.4" }, "dependencies": { - "@duckduckgo/autoconsent": "^10.15.0", + "@duckduckgo/autoconsent": "^12.0.0", "async": "^2.6.1", "chalk": "^2.4.1", "clickhouse": "^2.6.0", diff --git a/reporters/ClickhouseReporter.js b/reporters/ClickhouseReporter.js index 7305ce3b..22fbf6e7 100644 --- a/reporters/ClickhouseReporter.js +++ b/reporters/ClickhouseReporter.js @@ -63,7 +63,8 @@ const TABLE_DEFINITIONS = [ selfTestFail UInt8, errors Array(String), patterns Array(String), - snippets Array(String) + snippets Array(String), + filterListMatched Bool ) ENGINE = MergeTree() PRIMARY KEY(crawlId, pageId, name)`, `CREATE TABLE IF NOT EXISTS ${DB}.apiSavedCalls ( @@ -115,16 +116,13 @@ class ClickhouseReporter extends BaseReporter { } /** - * @param {{verbose: boolean, startTime: Date, urls: number, logPath: string}} options + * @param {{verbose: boolean, startTime: Date, urls: number, logPath: string}} options */ init(options) { this.verbose = options.verbose; this.client = new ClickHouse({url: CLICKHOUSE_SERVER}); this.crawlId = `${new Date().toISOString()}-${os.hostname()}`; this.ready = Promise.all(TABLE_DEFINITIONS.map(stmt => this.client.query(stmt).toPromise())); - if (this.verbose) { - console.log(`Creating crawl ${this.crawlId}`); - } this.queue = { pages: [], requests: [], @@ -143,6 +141,9 @@ class ClickhouseReporter extends BaseReporter { */ createCrawl(name = '', region = '') { this.ready.then(async () => { + if (this.verbose) { + console.log(`Creating crawl ${this.crawlId}`); + } await this.client.insert(`INSERT INTO ${DB}.crawls (crawlId, name, region)`, [{ crawlId: this.crawlId, name, @@ -194,7 +195,20 @@ class ClickhouseReporter extends BaseReporter { this.queue.elements.push([this.crawlId, pageId, data.data.elements.present, data.data.elements.visible]); } if (data.data.cmps) { - const cmpRows = data.data.cmps.map(c => [this.crawlId, pageId, c.name, c.final, c.open, c.started, c.succeeded, c.selfTestFail, c.errors, c.patterns || [], c.snippets || []]); + const cmpRows = data.data.cmps.map(c => [ + this.crawlId, + pageId, + c.name, + c.final, + c.open, + c.started, + c.succeeded, + c.selfTestFail, + c.errors, + c.patterns || [], + c.snippets || [], + c.filterListMatched || false, + ]); this.queue.cmps = this.queue.cmps.concat(cmpRows); } if (data.data.apis) {