diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index 19f0a7a87..b441943c0 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -8,6 +8,7 @@ import * as Sentry from "@sentry/node";
 import escapeHtml from "escape-html";
 import PdfParse from "pdf-parse";
 import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
+import { RemoveFeatureError } from "../../error";
 
 type PDFProcessorResult = {html: string, markdown?: string};
 
@@ -52,24 +53,47 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
   const jobId = upload.id;
 
   // TODO: timeout, retries
-  const result = await robustFetch({
-    url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
-    method: "GET",
-    headers: {
-      "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
-    },
-    logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
-    schema: z.object({
-      markdown: z.string(),
-    }),
-    tryCount: meta.options.timeout !== undefined ? 32 : 1200, // 5 minutes if timeout not specified
-    tryCooldown: 250,
-  });
-
-  return {
-    markdown: result.markdown,
-    html: await marked.parse(result.markdown, { async: true }),
-  };
+  const startedAt = Date.now();
+
+  while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
+    try {
+      const result = await robustFetch({
+        url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
+        method: "GET",
+        headers: {
+          "Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
+        },
+        logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
+        schema: z.object({
+          markdown: z.string(),
+        }),
+      });
+      return {
+        markdown: result.markdown,
+        html: await marked.parse(result.markdown, { async: true }),
+      };
+    } catch (e) {
+      if (e instanceof Error && e.message === "Request sent failure status") {
+        if ((e.cause as any).response.status === 404) {
+          // no-op, result not up yet
+        } else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) {
+          // URL is not a PDF, actually!
+          meta.logger.debug("URL is not actually a PDF, signalling...");
+          throw new RemoveFeatureError(["pdf"]);
+        } else {
+          throw new Error("LlamaParse threw an error", {
+            cause: e.cause,
+          });
+        }
+      } else {
+        throw e;
+      }
+    }
+
+    await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
+  }
+
+  throw new Error("LlamaParse timed out");
 }
 
 async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
@@ -107,8 +131,14 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
         logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
       }, tempFilePath);
     } catch (error) {
-      meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
-      Sentry.captureException(error);
+      if (error instanceof Error && error.message === "LlamaParse timed out") {
+        meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { error });
+      } else if (error instanceof RemoveFeatureError) {
+        throw error;
+      } else {
+        meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
+        Sentry.captureException(error);
+      }
     }
   }
 
diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts
index 3eb46033b..ccd7a359f 100644
--- a/apps/api/src/scraper/scrapeURL/error.ts
+++ b/apps/api/src/scraper/scrapeURL/error.ts
@@ -33,6 +33,15 @@ export class AddFeatureError extends Error {
   }
 }
 
+export class RemoveFeatureError extends Error {
+  public featureFlags: FeatureFlag[];
+
+  constructor(featureFlags: FeatureFlag[]) {
+    super("Incorrect feature flags have been discovered: " + featureFlags.join(", "));
+    this.featureFlags = featureFlags;
+  }
+}
+
 export class SiteError extends Error {
   public code: string;
   constructor(code: string) {
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index ad006d5a5..f394ca2b6 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -5,7 +5,7 @@ import { Document, ScrapeOptions } from "../../controllers/v1/types";
 import { logger } from "../../lib/logger";
 import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
 import { parseMarkdown } from "../../lib/html-to-markdown";
-import { AddFeatureError, EngineError, NoEnginesLeftError, SiteError, TimeoutError } from "./error";
+import { AddFeatureError, EngineError, NoEnginesLeftError, RemoveFeatureError, SiteError, TimeoutError } from "./error";
 import { executeTransformers } from "./transformers";
 import { LLMRefusalError } from "./transformers/llmExtract";
 import { urlSpecificParams } from "./lib/urlSpecificParams";
@@ -216,7 +216,7 @@
         startedAt,
         finishedAt: Date.now(),
       };
-    } else if (error instanceof AddFeatureError) {
+    } else if (error instanceof AddFeatureError || error instanceof RemoveFeatureError) {
       throw error;
     } else if (error instanceof LLMRefusalError) {
       results[engine] = {
@@ -293,6 +293,9 @@ export async function scrapeURL(
     if (error instanceof AddFeatureError && meta.internalOptions.forceEngine === undefined) {
       meta.logger.debug("More feature flags requested by scraper: adding " + error.featureFlags.join(", "), { error, existingFlags: meta.featureFlags });
       meta.featureFlags = new Set([...meta.featureFlags].concat(error.featureFlags));
+    } else if (error instanceof RemoveFeatureError && meta.internalOptions.forceEngine === undefined) {
+      meta.logger.debug("Incorrect feature flags reported by scraper: removing " + error.featureFlags.join(","), { error, existingFlags: meta.featureFlags });
+      meta.featureFlags = new Set([...meta.featureFlags].filter(x => !error.featureFlags.includes(x)));
     } else {
       throw error;
     }
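The pdf engine change above replaces robustFetch's built-in tryCount/tryCooldown retries with an explicit poll-until-deadline loop, which is what lets the catch block inspect each failed attempt. A minimal sketch of that pattern, assuming an invented generic helper `pollUntil` (not part of the firecrawl codebase):

```ts
// Sketch of the polling pattern the pdf engine now uses: poll an async job
// until a deadline, sleeping between attempts. All names are illustrative.
async function pollUntil<T>(
  fetchOnce: () => Promise<T | null>, // returns null while the result is not ready
  timeoutMs: number,
  cooldownMs: number,
): Promise<T> {
  const startedAt = Date.now();
  while (Date.now() <= startedAt + timeoutMs) {
    const result = await fetchOnce();
    if (result !== null) {
      return result;
    }
    // Sleep before the next attempt, like the 250ms cooldown in the diff.
    await new Promise<void>((resolve) => setTimeout(() => resolve(), cooldownMs));
  }
  throw new Error("polling timed out");
}
```

In the actual diff the deadline falls back to 300000 ms (5 minutes) when `meta.options.timeout` is unset, and the cooldown between attempts is 250 ms.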
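The `RemoveFeatureError` round trip works across all three files: an engine throws from deep inside a scrape, `scrapeURLLoop` re-throws it untouched, and `scrapeURL` strips the offending flags and retries. Here is a self-contained sketch of that control flow; `scrapeOnce`, `scrapeWithRetries`, and the retry bound are invented for the example and are not the actual firecrawl API:

```ts
// Illustrative only -- these names mirror the diff but this is not firecrawl code.
type FeatureFlag = "pdf" | "docx";

class RemoveFeatureError extends Error {
  public featureFlags: FeatureFlag[];

  constructor(featureFlags: FeatureFlag[]) {
    super("Incorrect feature flags have been discovered: " + featureFlags.join(", "));
    this.featureFlags = featureFlags;
  }
}

// An engine throws mid-scrape when it discovers a flag is wrong, the way
// LlamaParse's PDF_IS_BROKEN response signals that the URL is not a PDF.
async function scrapeOnce(url: string, flags: Set<FeatureFlag>): Promise<string> {
  if (flags.has("pdf") && !url.endsWith(".pdf")) {
    throw new RemoveFeatureError(["pdf"]);
  }
  return `scraped ${url} with [${[...flags].join(", ")}]`;
}

// The outer loop mirrors scrapeURL: strip the offending flags and retry, so
// the next pass builds its engine fallback list without the pdf engine.
async function scrapeWithRetries(url: string, flags: Set<FeatureFlag>): Promise<string> {
  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      return await scrapeOnce(url, flags);
    } catch (error) {
      if (error instanceof RemoveFeatureError) {
        const removed = error.featureFlags;
        flags = new Set([...flags].filter((f) => !removed.includes(f)));
      } else {
        throw error;
      }
    }
  }
  throw new Error("retries exhausted");
}

// scrapeWithRetries("https://example.com/page", new Set(["pdf"]))
//   => attempt 1 throws RemoveFeatureError(["pdf"]), attempt 2 succeeds without it.
```

Throwing an error rather than returning a sentinel lets the signal cross several stack frames to the single retry site that already handles `AddFeatureError`, which is why the diff only needs the one extra `instanceof` check in `scrapeURLLoop` to re-throw it.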