Record the negotiated protocols and TLS cipher suite on WARC response records
(WARC-Protocol, WARC-Cipher-Suite) and bump warcio to ^2.4.2.

NOTE(review): line breaks, blank context lines and indentation in this patch
were reconstructed from the hunk line counts; confirm them against the target
tree before applying. The "index" blob hashes for src/util/reqresp.ts and
tests/pageinfo-records.test.js predate the review fixes in those hunks.

diff --git a/package.json b/package.json
index 63dd203f..cb714465 100644
--- a/package.json
+++ b/package.json
@@ -37,7 +37,7 @@
     "tsc": "^2.0.4",
     "undici": "^6.18.2",
     "uuid": "8.3.2",
-    "warcio": "^2.4.0",
+    "warcio": "^2.4.2",
     "ws": "^7.4.4",
     "yargs": "^17.7.2"
   },
diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index c237ce34..3b2bb6e4 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -12,7 +12,7 @@ import { fetch, getGlobalDispatcher, Response } from "undici";
 
 import { getCustomRewriter, rewriteDASH, rewriteHLS } from "@webrecorder/wabac";
 
-import { WARCRecord } from "warcio";
+import { WARCRecord, multiValueHeader } from "warcio";
 import { TempFileBuffer, WARCSerializer } from "warcio/node";
 import { WARCWriter } from "./warcwriter.js";
 import { RedisCrawlState, WorkerId } from "./state.js";
@@ -1765,6 +1765,17 @@ function createResponse(
     "WARC-Page-ID": pageid,
   };
 
+  if (reqresp.protocols.length) {
+    warcHeaders["WARC-Protocol"] = multiValueHeader(
+      "WARC-Protocol",
+      reqresp.protocols,
+    );
+  }
+
+  if (reqresp.cipher) {
+    warcHeaders["WARC-Cipher-Suite"] = reqresp.cipher;
+  }
+
   if (reqresp.resourceType) {
     warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
   }
diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts
index 5548cb5b..a57cd14c 100644
--- a/src/util/reqresp.ts
+++ b/src/util/reqresp.ts
@@ -2,9 +2,11 @@ import { getCustomRewriter, getStatusText } from "@webrecorder/wabac";
 
 import { Protocol } from "puppeteer-core";
 import { postToGetUrl } from "warcio";
-import { HTML_TYPES } from "./constants.js";
 import { Response } from "undici";
 
+import { HTML_TYPES } from "./constants.js";
+import { logger } from "./logger.js";
+
 const CONTENT_LENGTH = "content-length";
 const CONTENT_RANGE = "content-range";
 const RANGE = "range";
@@ -25,7 +27,9 @@ export class RequestResponseInfo {
 
   method?: string;
   url!: string;
-  protocol?: string = "HTTP/1.1";
+
+  protocols: string[] = [];
+  cipher?: string;
 
   mimeType?: string;
 
@@ -132,7 +136,9 @@ export class RequestResponseInfo {
 
     this.setStatus(response.status, response.statusText);
 
-    this.protocol = response.protocol;
+    if (response.protocol) {
+      this.protocols.push(response.protocol);
+    }
 
     if (resourceType) {
       this.resourceType = resourceType.toLowerCase();
@@ -153,11 +159,17 @@ export class RequestResponseInfo {
 
     this.fromServiceWorker = !!response.fromServiceWorker;
 
-    if (response.securityDetails) {
-      const issuer: string = response.securityDetails.issuer || "";
+    const { securityDetails } = response;
+
+    if (securityDetails) {
+      const securityProtocol = securityDetails.protocol
+        .replaceAll(" ", "/")
+        .toLowerCase();
+      this.protocols.push(securityProtocol);
+      this.cipher = getCipher(securityDetails, securityProtocol, this.url);
+      const issuer: string = securityDetails.issuer || "";
       const ctc: string =
-        response.securityDetails.certificateTransparencyCompliance ===
-        "compliant"
+        securityDetails.certificateTransparencyCompliance === "compliant"
           ? "1"
           : "0";
       this.extraOpts.cert = { issuer, ctc };
@@ -204,21 +216,6 @@ export class RequestResponseInfo {
     this.requestHeaders = params.headers;
   }
 
-  getResponseHeadersText() {
-    let headers = `${this.protocol} ${this.status} ${this.statusText}\r\n`;
-
-    if (this.responseHeaders) {
-      for (const header of Object.keys(this.responseHeaders)) {
-        headers += `${header}: ${this.responseHeaders[header].replace(
-          /\n/g,
-          ", ",
-        )}\r\n`;
-      }
-    }
-    headers += "\r\n";
-    return headers;
-  }
-
   hasRequest() {
     return this.method && (this.requestHeaders || this.requestHeadersText);
   }
@@ -417,3 +414,49 @@ export function isHTMLMime(mime: string) {
 export function isRedirectStatus(status: number) {
   return status >= 300 && status < 400 && status !== 304;
 }
+
+/**
+ * Derive a TLS cipher suite name (e.g. "TLS_AES_128_GCM_SHA256") from the
+ * security details reported by the browser for a response.
+ *
+ * The `keyExchange`, `keyExchangeGroup` and `cipher` fields are joined with
+ * single spaces and looked up in a fixed table. If there is no table entry
+ * and the connection is TLS 1.3, the suite is chosen from `cipher` alone.
+ *
+ * @param protocol - normalized security protocol, e.g. "tls/1.3"
+ * @param url - URL of the response, used only in the debug log message
+ * @returns the cipher suite name, or "" if none could be determined
+ */
+function getCipher(
+  { keyExchange, keyExchangeGroup, cipher }: Protocol.Network.SecurityDetails,
+  protocol: string,
+  url: string,
+): string {
+  const key = `${keyExchange} ${keyExchangeGroup} ${cipher}`;
+  const mapping: Record<string, string> = {
+    "ECDHE_RSA X25519 AES_128_GCM": "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256",
+    "ECDHE_RSA X25519 AES_256_GCM": "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384",
+    " X25519Kyber768Draft00 AES_128_GCM": "TLS_AES_128_GCM_SHA256",
+    " X25519 AES_128_GCM": "TLS_AES_128_GCM_SHA256",
+    " X25519Kyber768Draft00 AES_256_GCM": "TLS_AES_256_GCM_SHA384",
+    " X25519 AES_256_GCM": "TLS_AES_256_GCM_SHA384",
+    " P-256 AES_256_GCM": "TLS_AES_256_GCM_SHA384",
+  };
+  let cipherString = mapping[key] || "";
+  if (!cipherString && protocol === "tls/1.3") {
+    // Unmapped key-exchange group: fall back to the cipher name alone.
+    switch (cipher) {
+      case "AES_256_GCM":
+        cipherString = "TLS_AES_256_GCM_SHA384";
+        break;
+
+      case "AES_128_GCM":
+        cipherString = "TLS_AES_128_GCM_SHA256";
+        break;
+    }
+  }
+  if (!cipherString) {
+    logger.debug("No cipher for", { key, url });
+  }
+  return cipherString;
+}
diff --git a/tests/pageinfo-records.test.js b/tests/pageinfo-records.test.js
index 01dc77a4..15a0a98b 100644
--- a/tests/pageinfo-records.test.js
+++ b/tests/pageinfo-records.test.js
@@ -24,6 +24,12 @@ test("run warc and ensure pageinfo records contain the correct resources", async
   let foundInvalid = false;
 
   for await (const record of parser) {
+    if (record.warcType === "response" &&
+      (record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) {
+      expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3");
+      expect(record.warcHeaders.headers.get("WARC-Cipher-Suite")).toBe("TLS_AES_128_GCM_SHA256");
+    }
+
     if (
       !foundIndex &&
       record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/"
diff --git a/yarn.lock b/yarn.lock
index 2a398802..de544776 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -5281,7 +5281,7 @@ walker@^1.0.8:
   dependencies:
     makeerror "1.0.12"
 
-warcio@^2.3.1:
+warcio@^2.3.1, warcio@^2.4.0:
   version "2.3.1"
   resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.3.1.tgz#8ac9de897de1a556161168f2a3938b60929908ca"
   integrity sha512-PjcWqzXfs6HdWfHi1V/i8MoMmV5M0Csg3rOa2mqCJ1dmCJXswVfQ0VXbEVumwavNIW2oFFj6LJoCHHeL4Ls/zw==
@@ -5295,10 +5295,10 @@ warcio@^2.3.1:
     uuid-random "^1.3.2"
     yargs "^17.6.2"
 
-warcio@^2.4.0:
-  version "2.4.0"
-  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.0.tgz#13bae2837f1bbf5cf7585f75857e6311d30557bd"
-  integrity sha512-EfxXCgsnZ35CGf2j99QBMyB6EI98KEQ6YmeER+8Lnv/4KFJ3thT76PiX37HfZVbPJS21JihA0Eddjk9QBQRlPg==
+warcio@^2.4.2:
+  version "2.4.2"
+  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.2.tgz#782d8dcb0769f271b0ae96521fb4969e2570e9b3"
+  integrity sha512-QYbZ3EGYtnAIrzL7Bajo7ak87pipilpkIfaFIzFQWUX4wuXNuKqnfQy/EAoi2tEIl3VJgsWcL+wjjk4+15MKbQ==
   dependencies:
     "@types/pako" "^1.0.7"
     "@types/stream-buffers" "^3.0.7"