Skip to content

Commit

Permalink
Merge pull request #233 from harvard-lil/perf-debug
Browse files: browse the repository at this point in the history
Potential fixes for capture of attachments edge cases
  • Loading branch information
matteocargnelutti authored Oct 30, 2023
2 parents f5eb47e + 3a8c8ca commit b253c75
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 26 deletions.
50 changes: 31 additions & 19 deletions Scoop.js
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ export class Scoop {
steps.push({
name: 'Out-of-browser detection and capture of non-web resource',
alwaysRun: true,
webPageOnly: false,
main: async (page) => {
await this.#detectAndCaptureNonWebContent(page)
}
Expand All @@ -258,6 +259,7 @@ export class Scoop {
// Push step: Wait for initial page load
steps.push({
name: 'Wait for initial page load',
alwaysRun: false,
webPageOnly: true,
main: async (page) => {
await page.goto(this.url, { waitUntil: 'load', timeout: options.loadTimeout })
Expand All @@ -283,6 +285,7 @@ export class Scoop {
) {
steps.push({
name: 'Browser scripts',
alwaysRun: false,
webPageOnly: true,
setup: async (page) => {
// Determine path of `behaviors.js`
Expand Down Expand Up @@ -319,6 +322,7 @@ export class Scoop {
// Push step: Wait for network idle
steps.push({
name: 'Wait for network idle',
alwaysRun: false,
webPageOnly: true,
main: async (page) => {
await page.waitForLoadState('networkidle', { timeout: options.networkIdleTimeout })
Expand All @@ -331,20 +335,23 @@ export class Scoop {
alwaysRun: options.attachmentsBypassLimits,
webPageOnly: true,
main: async (page) => {
await page.evaluate(() => window.scrollTo(0, 0))
await Promise.race([
page.evaluate(() => window.scrollTo(0, 0)),
new Promise(resolve => setTimeout(resolve, 2500)) // Only wait for up to 2.5s for scroll up to happen
])
}
})

// Push step: Screenshot
if (options.screenshot) {
steps.push({
name: 'Screenshot',
webPageOnly: true,
alwaysRun: options.attachmentsBypassLimits,
webPageOnly: true,
main: async (page) => {
const url = 'file:///screenshot.png'
const httpHeaders = new Headers({ 'content-type': 'image/png' })
const body = await page.screenshot({ fullPage: true })
const body = await page.screenshot({ fullPage: true, timeout: 5000 })
const isEntryPoint = true
const description = `Capture Time Screenshot of ${this.url}`

Expand All @@ -357,8 +364,8 @@ export class Scoop {
if (options.domSnapshot) {
steps.push({
name: 'DOM snapshot',
webPageOnly: true,
alwaysRun: options.attachmentsBypassLimits,
webPageOnly: true,
main: async (page) => {
const url = 'file:///dom-snapshot.html'
const httpHeaders = new Headers({
Expand All @@ -378,8 +385,8 @@ export class Scoop {
if (options.pdfSnapshot) {
steps.push({
name: 'PDF snapshot',
webPageOnly: true,
alwaysRun: options.attachmentsBypassLimits,
webPageOnly: true,
main: async (page) => {
await this.#takePdfSnapshot(page)
}
Expand All @@ -390,8 +397,8 @@ export class Scoop {
if (options.captureVideoAsAttachment) {
steps.push({
name: 'Out-of-browser capture of video as attachment (if any)',
webPageOnly: true,
alwaysRun: options.attachmentsBypassLimits,
webPageOnly: true,
main: async () => {
await this.#captureVideoAsAttachment()
}
Expand All @@ -403,6 +410,7 @@ export class Scoop {
steps.push({
name: 'Capturing certificates info',
alwaysRun: options.attachmentsBypassLimits,
webPageOnly: false,
main: async () => {
await this.#captureCertificatesAsAttachment()
}
Expand All @@ -414,6 +422,7 @@ export class Scoop {
steps.push({
name: 'Provenance summary',
alwaysRun: options.attachmentsBypassLimits,
webPageOnly: false,
main: async (page) => {
await this.#captureProvenanceInfo(page)
}
Expand All @@ -432,7 +441,7 @@ export class Scoop {
this.log.info(`🍨 Starting capture of ${this.url}.`)
this.state = Scoop.states.CAPTURE
} catch (err) {
this.log.error('An error ocurred during capture setup.')
this.log.error('An error occurred during capture setup.')
this.log.trace(err)
this.state = Scoop.states.FAILED
return // exit early if the browser and proxy couldn't be launched
Expand Down Expand Up @@ -464,7 +473,7 @@ export class Scoop {
}

// Page was closed
if (page.isClosed()) {
if (this.targetUrlIsWebPage && page.isClosed()) {
this.log.error('Page closed before it could be captured.')
shouldStop = true
}
Expand All @@ -479,7 +488,7 @@ export class Scoop {
//
try {
// Only if state is `CAPTURE`, unless `alwaysRun` is set for step
let shouldRun = this.state === Scoop.states.CAPTURE || step.alwaysRun
let shouldRun = this.state === Scoop.states.CAPTURE || step.alwaysRun === true

// BUT: `webPageOnly` takes precedence - allows for skipping unnecessary steps when capturing non-web content
if (this.targetUrlIsWebPage === false && step.webPageOnly) {
Expand All @@ -503,7 +512,7 @@ export class Scoop {
// Check capture state every second - so current step can be interrupted if state changes
new Promise(resolve => {
stateCheckInterval = setInterval(() => {
if (this.state !== Scoop.states.CAPTURE) {
if (this.state !== Scoop.states.CAPTURE && step.alwaysRun !== true) {
resolve(true)
}
}, 1000)
Expand Down Expand Up @@ -585,6 +594,7 @@ export class Scoop {

// Playwright init + pass proxy info to Chromium
const userAgent = chromium._playwright.devices['Desktop Chrome'].userAgent + options.userAgentSuffix
this.provenanceInfo.userAgent = userAgent
this.log.info(`User Agent used for capture: ${userAgent}`)

this.#browser = await chromium.launch({
Expand Down Expand Up @@ -618,7 +628,7 @@ export class Scoop {
}

/**
* Tears down Playwright, intercepter resources, and capture-specific temporary folder.
* Tears down Playwright, intercepter, and capture-specific temporary folder.
* @returns {Promise<void>}
*/
async teardown () {
Expand Down Expand Up @@ -728,7 +738,7 @@ export class Scoop {
// Capture using curl behind proxy
//
try {
const userAgent = await page.evaluate(() => window.navigator.userAgent) // Source user agent from the browser
const userAgent = this.provenanceInfo.userAgent

let timeout = this.options.captureTimeout - headRequestTimeMs

Expand Down Expand Up @@ -811,7 +821,7 @@ export class Scoop {
// If `headless`: request the favicon using curl so it's added to the exchanges list.
if (this.options.headless) {
try {
const userAgent = await page.evaluate(() => window.navigator.userAgent) // Source user agent from the browser
const userAgent = this.provenanceInfo.userAgent

const timeout = 1000

Expand All @@ -833,7 +843,7 @@ export class Scoop {

// Look for favicon in exchanges
for (const exchange of this.intercepter.exchanges) {
if (exchange?.url && exchange.url === this.pageInfo.faviconUrl) {
if (exchange?.url && exchange.url === this.pageInfo.faviconUrl && exchange?.response?.body) {
this.pageInfo.favicon = exchange.response.body
}
}
Expand Down Expand Up @@ -1171,14 +1181,17 @@ export class Scoop {
async #captureProvenanceInfo (page) {
let captureIp = 'UNKNOWN'
const osInfo = await getOSInfo()
const userAgent = await page.evaluate(() => window.navigator.userAgent) // Source user agent from the browser in case it was altered during capture
let ytDlpHash = ''
let cripHash = ''

// Grab public IP address
// Grab public IP address - uses CURL
try {
const response = await fetch(this.options.publicIpResolverEndpoint)
const ip = (await response.text()).trim()
const response = await exec('curl', [
this.options.publicIpResolverEndpoint,
'--max-time', '3'
])

const ip = response.trim()

try {
new Address4(ip) // eslint-disable-line
Expand Down Expand Up @@ -1224,7 +1237,6 @@ export class Scoop {
this.provenanceInfo = {
...this.provenanceInfo,
captureIp,
userAgent,
software: CONSTANTS.SOFTWARE,
version: CONSTANTS.VERSION,
osType: os.type(),
Expand Down
10 changes: 5 additions & 5 deletions intercepters/ScoopProxy.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ export class ScoopProxy extends ScoopIntercepter {

/**
* Attempts to close the proxy server. Skips after X seconds if unable to do so.
* @returns {Promise<void>}
* @returns {Promise<boolean>}
*/
teardown () {
let closeTimeout = null
Expand All @@ -91,16 +91,16 @@ export class ScoopProxy extends ScoopIntercepter {
// server.close does not close keep-alive connections so do so here
this.#connection.closeAllConnections()
this.#connection.close(() => {
this.capture.log.info('TCP-Proxy-Server closed')
clearTimeout(closeTimeout)
resolve()
this.capture.log.info('TCP-Proxy-Server closed')
resolve(true)
})
}),

new Promise(resolve => {
closeTimeout = setTimeout(() => {
this.capture.log.warn('TCP-Proxy-Server did not close properly.')
resolve()
this.capture.log.warn('TCP-Proxy-Server did not close properly')
resolve(false)
}, 5000)
})
])
Expand Down
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"@harvard-lil/portal": "^0.0.2",
"@laverdet/beaugunderson-ip-address": "^8.1.0",
"@playwright/browser-chromium": "^1.39.0",
"browsertrix-behaviors": "^0.5.0-beta.0",
"browsertrix-behaviors": "0.5.2",
"chalk": "^5.2.0",
"commander": "^11.0.0",
"get-os-info": "^1.0.2",
Expand Down

0 comments on commit b253c75

Please sign in to comment.