From 538964902d149b16b660e501e7fb666bc20c8805 Mon Sep 17 00:00:00 2001 From: Matteo Cargnelutti Date: Mon, 10 Apr 2023 21:00:29 -0400 Subject: [PATCH 1/2] (Tentative) Move SSL capture to ScoopProxy Implements #138 --- - Removes `crip` dependency, dedicated certificates capture step and associated options. - Intercepts certificate chain at `ScoopProxy` level using `socket.getPeerCertificate()` to assemble a PEM on the fly. Runs once per origin. - Removes duplicate processing of `noarchive` checks --- **Still working through:** The certificates interception currently happens at `ScoopProxy.onResponse()` level. It should be in `ScoopProxy.onConnected()`, but in some cases it appears to be _"too early"_. TBD, but this version works. --- README.md | 4 +- Scoop.js | 167 +++++------------------- Scoop.test.js | 15 ++- assets/templates/provenance-summary.njk | 6 +- bin/cli.js | 14 -- intercepters/ScoopProxy.js | 53 +++++++- options.js | 10 +- options.types.js | 2 - postinstall.sh | 13 -- 9 files changed, 106 insertions(+), 178 deletions(-) diff --git a/README.md b/README.md index b4206495..47dac6dc 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,6 @@ Options: --network-idle-timeout Max time Scoop will wait for the in-browser networking tasks to complete, in ms. (default: 20000) --behaviors-timeout Max time Scoop will wait for the browser behaviors to complete, in ms. (default: 20000) --capture-video-as-attachment-timeout Max time Scoop will wait for the video capture process to complete, in ms. (default: 30000) - --capture-certificates-as-attachment-timeout Max time Scoop will wait for the certificates capture process to complete, in ms. (default: 10000) --capture-window-x Width of the browser window Scoop will open to capture, in pixels. (default: 1600) --capture-window-y Height of the browser window Scoop will open to capture, in pixels. (default: 900) --max-capture-size Size limit for the capture's exchanges list, in bytes. 
(default: 209715200) @@ -201,7 +200,6 @@ Options: --proxy-verbose Should Scoop's HTTP proxy output logs to the console? (choices: "true", "false", default: "false") --public-ip-resolver-endpoint API endpoint to be used to resolve the client's IP address. Used in the context of the provenance summary. (default: "https://icanhazip.com") --yt-dlp-path Path to the yt-dlp executable. Used for capturing videos. (default: "[library]/executables/yt-dlp") - --crip-path Path to the crip executable. Used for capturing SSL/TLS certificates. (default: "[library]/executables/crip") --log-level Controls Scoop CLI's verbosity. (choices: "silent", "trace", "debug", "info", "warn", "error", default: "info") -h, --help Show options list. ``` @@ -362,7 +360,7 @@ Namely: - Same goes for certificates, captured as attachments via [crip](https://github.com/Hakky54/certificate-ripper). - Favicons may be captured out-of-band using [curl](https://curl.se/), if not intercepted during capture. -Exchanges captured in that context still go through Scoop's HTTP proxy, with the exception of _crip_. +Exchanges captured in that context still go through Scoop's HTTP proxy. ```mermaid flowchart LR diff --git a/Scoop.js b/Scoop.js index 3e954ba0..4c601f6b 100644 --- a/Scoop.js +++ b/Scoop.js @@ -149,16 +149,17 @@ export class Scoop { * cpuArchitecture: ?string, * blockedRequests: Array.<{match: string, rule: string}>, * noArchiveUrls: string[], - * certificates: Array.<{host: string, pem: string}>, + * certificates: Object., * ytDlpHash: string, * cripHash: string, * options: ScoopOptions, * }} + * */ provenanceInfo = { blockedRequests: [], noArchiveUrls: [], - certificates: [] + certificates: {} /// Key: host, Value: PEM string } /** @@ -385,30 +386,6 @@ export class Scoop { }) } - // Push step: noarchive directive detection - // TODO: Move this logic back to ScoopProxy.intercept() when new proxy implementation is ready. 
- steps.push({ - name: 'Detecting "noarchive" directive', - alwaysRun: true, - webPageOnly: true, - main: async () => { - for (const exchange of this.intercepter.exchanges) { - this.intercepter.checkExchangeForNoArchive(exchange) - } - } - }) - - // Push step: certs capture - if (options.captureCertificatesAsAttachment) { - steps.push({ - name: 'Capturing certificates info', - alwaysRun: options.attachmentsBypassLimits, - main: async () => { - await this.#captureCertificatesAsAttachment() - } - }) - } - // Push step: Provenance summary if (options.provenanceSummary) { steps.push({ @@ -1038,98 +1015,6 @@ export class Scoop { this.addGeneratedExchange(url, httpHeaders, body, isEntryPoint, description) } - /** - * Runs `crip` against the different origins the capture process encountered. - * Captures certificates as `file:///[origin].pem`). - * Populates `this.provenanceInfo.certificates`. - * - * @returns {Promise} - * @private - */ - async #captureCertificatesAsAttachment () { - const { captureCertificatesAsAttachmentTimeout, cripPath } = this.options - - // - // Start timeout timer - // - let timeIsOut = false - const timer = setTimeout(() => { timeIsOut = true }, captureCertificatesAsAttachmentTimeout) - - // - // Check that `crip` is available - // - try { - await exec(cripPath) - } catch (err) { - this.log.trace(err) - throw new Error('"crip" executable is not available or cannot be executed.') - } - - // - // Pull certs - // - const processedHosts = new Map() - - for (const exchange of this.intercepter.exchanges) { - const url = new URL(exchange.url) - - if (timeIsOut) { - throw new Error('Capture certificates at attachment timeout reached') - } - - if (url.protocol !== 'https:' || processedHosts.get(url.host) === true) { - continue - } - - if (this.blocklist.find(searchBlocklistFor(`https://${url.host}`))) { - this.log.warn(`${url.host} matched against blocklist - skipped trying to pull its certificate.`) - continue - } - - try { - const cripOptions = [ - 
'print', - '-u', `https://${url.host}`, - '-f', 'pem' - ] - - let timeout = captureCertificatesAsAttachmentTimeout - - if (processedHosts.length > 0) { // Timeout per request decreases as we go through the list. - timeout = captureCertificatesAsAttachmentTimeout / processedHosts.length - } - - const spawnOptions = { - timeout: timeout > 1000 ? timeout : 1000, - maxBuffer: 1024 * 1024 * 128 - } - - const pem = await exec(cripPath, cripOptions, spawnOptions) - - processedHosts.set(url.host, true) - - if (!pem) { - throw new Error(`crip did not return a PEM for ${url.host}.`) - } - - // Add to generated exchanges - const fileUrl = `file:///${url.host}.pem` - const httpHeaders = new Headers({ 'content-type': 'application/x-pem-file' }) - const body = Buffer.from(pem) - const isEntryPoint = false - await this.addGeneratedExchange(fileUrl, httpHeaders, body, isEntryPoint) - - // Add to `this.provenanceInfo.certificates` - this.provenanceInfo.certificates.push({ host: url.host, pem }) - } catch (err) { - this.log.trace(err) - this.log.warn(`Certificates could not be extracted for ${url.host}`) - } - } - - clearTimeout(timer) - } - /** * Populates `this.provenanceInfo`, which is then used to generate a `file:///provenance-summary.html` exchange and entry point. * That property is also be used by `scoopToWACZ()` to populate the `extras` field of `datapackage.json`. 
@@ -1148,7 +1033,6 @@ export class Scoop { const osInfo = await getOSInfo() const userAgent = await page.evaluate(() => window.navigator.userAgent) // Source user agent from the browser in case it was altered during capture let ytDlpHash = '' - let cripHash = '' // Grab public IP address try { @@ -1183,18 +1067,6 @@ export class Scoop { this.log.trace(err) } - // Compute crip hash - try { - cripHash = createHash('sha256') - .update(await readFile(this.options.cripPath)) - .digest('hex') - - cripHash = `sha256:${cripHash}` - } catch (err) { - this.log.warn('Could not compute SHA256 hash of crip executable') - this.log.trace(err) - } - // Gather provenance info this.provenanceInfo = { ...this.provenanceInfo, @@ -1207,7 +1079,6 @@ export class Scoop { osVersion: osInfo.version, cpuArchitecture: os.machine(), ytDlpHash, - cripHash, options: structuredClone(this.options) } @@ -1235,6 +1106,38 @@ export class Scoop { } } + /** + * Adds an SSL certificate to the capture as: + * - An entry in `provenanceInfo.certificates` + * - A generated exchange (file:///{host}.pem) + * @param {string} host + * @param {string} pem + * @returns {Promise} + */ + async addCertificate (host, pem) { + host = `${host}` + pem = `${pem}` + + if (host.length < 3 || !host.includes('.')) { + throw new Error('"host" must be a valid network host.') + } + + if (!pem.startsWith('-----BEGIN CERTIFICATE-----') || + !pem.endsWith('-----END CERTIFICATE-----\n')) { + throw new Error('"pem" must be a valid certificate.') + } + + // Save as generated exchange + const fileUrl = `file:///${host}.pem` + const httpHeaders = new Headers({ 'content-type': 'application/x-pem-file' }) + const body = Buffer.from(pem) + const isEntryPoint = false + await this.addGeneratedExchange(fileUrl, httpHeaders, body, isEntryPoint) + + // Save in provenance info (if successful) + this.provenanceInfo.certificates[host] = pem + } + /** * Generates a ScoopGeneratedExchange for generated content and adds it to `exchanges`. 
* Unless `force` argument is passed, generated exchanges count towards time / size limits. diff --git a/Scoop.test.js b/Scoop.test.js index 2e5f2705..16758556 100644 --- a/Scoop.test.js +++ b/Scoop.test.js @@ -7,10 +7,10 @@ import express from 'express' import { FIXTURES_PATH } from './constants.js' import { isPNG, getDimensions } from './utils/png.js' import { isPDF, getPageCount } from './utils/pdf.js' -import { defaults } from './options.js' +import { defaults, testDefaults } from './options.js' import { Scoop } from './Scoop.js' -await test('Scoop - capture of a web page.', async (t) => { +await test('Scoop - capture of a (local) web page.', async (t) => { const app = express() const PORT = 3000 const URL = `http://localhost:${PORT}` @@ -103,7 +103,16 @@ await test('Scoop - capture of a web page.', async (t) => { server.close() }) -await test('Scoop - capture of a non-web resource.', async (t) => { +// Accounts for tests that can't be run locally +await test('Scoop - capture of a (remote) web page.', async (t) => { + await t.test('Scoop captures SSL certificates', async (_t) => { + const capture = await Scoop.capture('https://example.com', testDefaults) + assert(capture.provenanceInfo.certificates['example.com']) + assert(capture.extractGeneratedExchanges()['example.com.pem']) + }) +}) + +await test('Scoop - capture of a (local) non-web resource.', async (t) => { const app = express() const PORT = 3000 const URL = `http://localhost:${PORT}` diff --git a/assets/templates/provenance-summary.njk b/assets/templates/provenance-summary.njk index ae74c362..4616b4e8 100644 --- a/assets/templates/provenance-summary.njk +++ b/assets/templates/provenance-summary.njk @@ -165,14 +165,14 @@ {% endif %} - {% if certificates.length %} + {% if certificates|length %}

SSL/TLS Certificates

The following certificates were pulled by crip from the different origins encountered during capture.

- {% for cert in certificates %} -
  • {{ cert.host }}
  • + {% for host, pem in certificates %} +
  • {{ host }}
  • {% endfor %}
    diff --git a/bin/cli.js b/bin/cli.js index 4587e153..a66cecd8 100755 --- a/bin/cli.js +++ b/bin/cli.js @@ -140,13 +140,6 @@ program.addOption( .default(defaults.captureVideoAsAttachmentTimeout) ) -program.addOption( - new Option( - '--capture-certificates-as-attachment-timeout ', - 'Max time Scoop will wait for the certificates capture process to complete, in ms.') - .default(defaults.captureCertificatesAsAttachmentTimeout) -) - // // Dimensions // @@ -283,13 +276,6 @@ program.addOption( .default(defaults.ytDlpPath) ) -program.addOption( - new Option( - '--crip-path ', - 'Path to the crip executable. Used for capturing SSL/TLS certificates.') - .default(defaults.cripPath) -) - program.addOption( new Option('--log-level ', 'Controls Scoop CLI\'s verbosity.') .choices(['silent', 'trace', 'debug', 'info', 'warn', 'error']) diff --git a/intercepters/ScoopProxy.js b/intercepters/ScoopProxy.js index d67575c1..f798a4ea 100644 --- a/intercepters/ScoopProxy.js +++ b/intercepters/ScoopProxy.js @@ -139,6 +139,7 @@ export class ScoopProxy extends ScoopIntercepter { onConnected (serverSocket, request) { const ip = serverSocket.remoteAddress const rule = this.findMatchingBlocklistRule(ip) + if (rule) { serverSocket.destroy() this.blockRequest(request, ip, rule) @@ -147,18 +148,68 @@ export class ScoopProxy extends ScoopIntercepter { /** * On response: + * - Capture cert if relevant option is on, cert is available, and not already captured for this host * - Check for "noarchive" directive + * * @param {http.ServerResponse} response * @param {http.ClientRequest} request */ onResponse (response, request) { - // there will not be an exchange with this request if we're, for instance, not recording + // There will not be an exchange with this request if we're, for instance, not recording const exchange = this.exchanges.find(ex => ex.requestParsed === request) + // Copy response, check for no-archive directive if (exchange) { exchange.responseParsed = response response.on('end', () 
=> this.checkExchangeForNoArchive(exchange)) } + + // Capture SSL cert + if (exchange && response.socket) { + const { provenanceInfo, options } = this.capture + const host = new URL(exchange.url).host + + if (!response.socket.getPeerCertificate || + !options.captureCertificatesAsAttachment || + provenanceInfo.certificates[host]) { + return + } + + let cert = response.socket.getPeerCertificate(true) + let pem = '' + let lastRawCert = null + + // Go through the certificate chain using its recursive `issuerCertificate` property to assemble the .pem file. + while (cert && cert?.raw && cert.raw.length > 0) { + // In some cases, "issuerCertificate" recurses infinitely. + // This breaker accounts for that edge case. + if (lastRawCert === cert.raw) { + break + } + + const certAsBase64 = cert.raw.toString('base64') + + pem += '-----BEGIN CERTIFICATE-----\n' + for (let i = 0; i < certAsBase64.length; i += 64) { + pem += certAsBase64.slice(i, i + 64) + pem += '\n' + } + pem += '-----END CERTIFICATE-----\n' + + lastRawCert = cert.raw + cert = cert?.issuerCertificate + } + + if (!pem) { + return + } + + this.capture.addCertificate(host, pem) + .catch(err => { + this.capture.log.trace(err) + this.capture.log.warn(`An error occurred while capturing certificate for "${host}".`) + }) + } } /** diff --git a/options.js b/options.js index 43706096..7e19d159 100644 --- a/options.js +++ b/options.js @@ -20,7 +20,6 @@ export const defaults = { networkIdleTimeout: 20 * 1000, behaviorsTimeout: 20 * 1000, captureVideoAsAttachmentTimeout: 30 * 1000, - captureCertificatesAsAttachmentTimeout: 10 * 1000, captureWindowX: 1600, captureWindowY: 900, @@ -73,8 +72,7 @@ export const defaults = { proxyVerbose: false, publicIpResolverEndpoint: 'https://icanhazip.com', - ytDlpPath: `${CONSTANTS.EXECUTABLES_PATH}yt-dlp`, - cripPath: `${CONSTANTS.EXECUTABLES_PATH}crip` + ytDlpPath: `${CONSTANTS.EXECUTABLES_PATH}yt-dlp` } /** @@ -133,10 +131,8 @@ export function filterOptions (newOptions = {}) { } // Check 
that paths are valid - for (const toCheck of ['ytDlpPath', 'cripPath']) { - if (!statSync(options[toCheck]).isFile()) { - throw new Error(`"${toCheck}" must be a path to a file.`) - } + if (!statSync(options.ytDlpPath).isFile()) { + throw new Error('"ytDlpPath" must be a path to a file.') } return options diff --git a/options.types.js b/options.types.js index f14e012f..1fb6cc97 100644 --- a/options.types.js +++ b/options.types.js @@ -16,7 +16,6 @@ * @property {number} networkIdleTimeout=20000 - How long should Scoop wait for network events to complete, in ms. * @property {number} behaviorsTimeout=20000 - How long should Scoop wait for media to play, secondary resources, and site specific behaviors (in total), in ms? * @property {number} captureVideoAsAttachmentTimeout=30000 - How long should Scoop wait for `captureVideoAsAttachment` to finish. - * @property {number} captureCertificatesAsAttachmentTimeout=10000 - How long should Scoop wait for `captureCertificatesAsAttachment` to finish. * * @property {number} captureWindowX=1600 - Browser window resolution in pixels: X axis. * @property {number} captureWindowY=900 - Browser window resolution in pixels: Y axis. @@ -39,5 +38,4 @@ * * @property {string} publicIpResolverEndpoint="https://icanhazip.com" - URL to be used to retrieve the client's public IP address for `provenanceSummary`. Endpoint requirements: must simply return a IPv4 or IPv6 address as text. * @property {string} ytDlpPath="./executables/yt-dlp" - Path to the yt-dlp executable to be used. (https://github.com/yt-dlp/yt-dlp) - * @property {string} cripPath="./executables/crip" - Path to the crip executable to be used. 
(https://github.com/Hakky54/certificate-ripper) */ diff --git a/postinstall.sh b/postinstall.sh index dacea8f2..371229f2 100755 --- a/postinstall.sh +++ b/postinstall.sh @@ -6,16 +6,3 @@ mkdir ./executables/; # Pull yt-dlp (v2023.03.04) curl -L https://github.com/yt-dlp/yt-dlp/releases/download/2023.03.04/yt-dlp > ./executables/yt-dlp; chmod a+x ./executables/yt-dlp; - -# Pull crip (v2.1.0) -if [ "$(uname)" == "Darwin" ]; then - curl -L https://github.com/Hakky54/certificate-ripper/releases/download/2.1.0/crip-macos-amd64.tar.gz > ./executables/crip.tar.gz; -elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then - curl -L https://github.com/Hakky54/certificate-ripper/releases/download/2.1.0/crip-linux-amd64.tar.gz > ./executables/crip.tar.gz; -fi - -cd ./executables; -tar -xzvf crip.tar.gz; -chmod a+x crip; -rm crip.tar.gz; -cd ..; \ No newline at end of file From c7c0f9d67f0ea0df89336679070f8875d32f626a Mon Sep 17 00:00:00 2001 From: Matteo Cargnelutti Date: Tue, 11 Apr 2023 11:33:29 -0400 Subject: [PATCH 2/2] Minor edits --- assets/templates/provenance-summary.njk | 2 +- intercepters/ScoopProxy.js | 81 +++++++++++++------------ 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/assets/templates/provenance-summary.njk b/assets/templates/provenance-summary.njk index 4616b4e8..df077fd1 100644 --- a/assets/templates/provenance-summary.njk +++ b/assets/templates/provenance-summary.njk @@ -169,7 +169,7 @@

    SSL/TLS Certificates

    -

    The following certificates were pulled by crip from the different origins encountered during capture.

    +

The following certificates were pulled from the different origins encountered during capture.

    {% for host, pem in certificates %}
  • {{ host }}
  • diff --git a/intercepters/ScoopProxy.js b/intercepters/ScoopProxy.js index f798a4ea..47ba443e 100644 --- a/intercepters/ScoopProxy.js +++ b/intercepters/ScoopProxy.js @@ -148,6 +148,7 @@ export class ScoopProxy extends ScoopIntercepter { /** * On response: + * - Copy parsed response * - Capture cert if relevant option is on, cert is available, and not already captured for this host * - Check for "noarchive" directive * @@ -158,58 +159,58 @@ export class ScoopProxy extends ScoopIntercepter { // There will not be an exchange with this request if we're, for instance, not recording const exchange = this.exchanges.find(ex => ex.requestParsed === request) - // Copy response, check for no-archive directive - if (exchange) { - exchange.responseParsed = response - response.on('end', () => this.checkExchangeForNoArchive(exchange)) + if (!exchange) { + return } - // Capture SSL cert - if (exchange && response.socket) { - const { provenanceInfo, options } = this.capture - const host = new URL(exchange.url).host - - if (!response.socket.getPeerCertificate || - !options.captureCertificatesAsAttachment || - provenanceInfo.certificates[host]) { - return - } + exchange.responseParsed = response - let cert = response.socket.getPeerCertificate(true) - let pem = '' - let lastRawCert = null + response.on('end', () => this.checkExchangeForNoArchive(exchange)) - // Go through the certificate chain using its recursive `issuerCertificate` property to assemble the .pem file. - while (cert && cert?.raw && cert.raw.length > 0) { - // In some cases, "issuerCertificate" recurses infinitely. - // This breaker accounts for that edge case. 
- if (lastRawCert === cert.raw) { - break - } + // Capture SSL cert + const { provenanceInfo, options } = this.capture + const host = new URL(exchange.url).host - const certAsBase64 = cert.raw.toString('base64') + if (!response.socket.getPeerCertificate || + !options.captureCertificatesAsAttachment || + provenanceInfo.certificates[host]) { + return + } - pem += '-----BEGIN CERTIFICATE-----\n' - for (let i = 0; i < certAsBase64.length; i += 64) { - pem += certAsBase64.slice(i, i + 64) - pem += '\n' - } - pem += '-----END CERTIFICATE-----\n' + let cert = response.socket.getPeerCertificate(true) + let pem = '' + let lastRawCert = null - lastRawCert = cert.raw - cert = cert?.issuerCertificate + // Go through the certificate chain using its recursive `issuerCertificate` property to assemble the .pem file. + while (cert && cert?.raw && cert.raw.length > 0) { + // In some cases, "issuerCertificate" recurses infinitely. + // This breaker accounts for that edge case. + if (lastRawCert === cert.raw) { + break } - if (!pem) { - return + const certAsBase64 = cert.raw.toString('base64') + + pem += '-----BEGIN CERTIFICATE-----\n' + for (let i = 0; i < certAsBase64.length; i += 64) { + pem += certAsBase64.slice(i, i + 64) + pem += '\n' } + pem += '-----END CERTIFICATE-----\n' - this.capture.addCertificate(host, pem) - .catch(err => { - this.capture.log.trace(err) - this.capture.log.warn(`An error occurred while capturing certificate for "${host}".`) - }) + lastRawCert = cert.raw + cert = cert?.issuerCertificate + } + + if (!pem) { + return } + + this.capture.addCertificate(host, pem) + .catch(err => { + this.capture.log.trace(err) + this.capture.log.warn(`An error occurred while capturing certificate for "${host}".`) + }) } /**