Skip to content

Commit

Permalink
fix(scrapeURL/pdf): handle if a presumed PDF link returns HTML (e.g. 404)

Browse files Browse the repository at this point in the history
  • Loading branch information
mogery committed Dec 10, 2024
1 parent d9e017e commit d276a23
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 22 deletions.
70 changes: 50 additions & 20 deletions apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError } from "../../error";

type PDFProcessorResult = {html: string, markdown?: string};

Expand Down Expand Up @@ -52,24 +53,47 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
const jobId = upload.id;

// TODO: timeout, retries
const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
method: "GET",
headers: {
"Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
},
logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
schema: z.object({
markdown: z.string(),
}),
tryCount: meta.options.timeout !== undefined ? 32 : 1200, // 5 minutes if timeout not specified
tryCooldown: 250,
});

return {
markdown: result.markdown,
html: await marked.parse(result.markdown, { async: true }),
};
const startedAt = Date.now();

while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
try {
const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
method: "GET",
headers: {
"Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
},
logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
schema: z.object({
markdown: z.string(),
}),
});
return {
markdown: result.markdown,
html: await marked.parse(result.markdown, { async: true }),
};
} catch (e) {
if (e instanceof Error && e.message === "Request sent failure status") {
if ((e.cause as any).response.status === 404) {
// no-op, result not up yet
} else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) {
// URL is not a PDF, actually!
meta.logger.debug("URL is not actually a PDF, signalling...");
throw new RemoveFeatureError(["pdf"]);
} else {
throw new Error("LlamaParse threw an error", {
cause: e.cause,
});
}
} else {
throw e;
}
}

await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
}

throw new Error("LlamaParse timed out");
}

async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
Expand Down Expand Up @@ -107,8 +131,14 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
}, tempFilePath);
} catch (error) {
meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
Sentry.captureException(error);
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { error });
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
Sentry.captureException(error);
}
}
}

Expand Down
9 changes: 9 additions & 0 deletions apps/api/src/scraper/scrapeURL/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ export class AddFeatureError extends Error {
}
}

/**
 * Thrown by a scrape engine when it discovers mid-scrape that one or more
 * of the feature flags it was invoked with do not actually apply to the
 * target (e.g. a URL presumed to be a PDF that turns out to serve HTML).
 * The top-level scrape loop catches this, removes the listed flags from
 * the active flag set, and retries.
 */
export class RemoveFeatureError extends Error {
  /**
   * @param featureFlags - The flags that were incorrectly assumed and
   *   should be dropped before the scrape is retried.
   */
  constructor(public featureFlags: FeatureFlag[]) {
    super(`Incorrect feature flags have been discovered: ${featureFlags.join(", ")}`);
  }
}

export class SiteError extends Error {
public code: string;
constructor(code: string) {
Expand Down
7 changes: 5 additions & 2 deletions apps/api/src/scraper/scrapeURL/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { Document, ScrapeOptions } from "../../controllers/v1/types";
import { logger } from "../../lib/logger";
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { AddFeatureError, EngineError, NoEnginesLeftError, SiteError, TimeoutError } from "./error";
import { AddFeatureError, EngineError, NoEnginesLeftError, RemoveFeatureError, SiteError, TimeoutError } from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";
Expand Down Expand Up @@ -216,7 +216,7 @@ async function scrapeURLLoop(
startedAt,
finishedAt: Date.now(),
};
} else if (error instanceof AddFeatureError) {
} else if (error instanceof AddFeatureError || error instanceof RemoveFeatureError) {
throw error;
} else if (error instanceof LLMRefusalError) {
results[engine] = {
Expand Down Expand Up @@ -293,6 +293,9 @@ export async function scrapeURL(
if (error instanceof AddFeatureError && meta.internalOptions.forceEngine === undefined) {
meta.logger.debug("More feature flags requested by scraper: adding " + error.featureFlags.join(", "), { error, existingFlags: meta.featureFlags });
meta.featureFlags = new Set([...meta.featureFlags].concat(error.featureFlags));
} else if (error instanceof RemoveFeatureError && meta.internalOptions.forceEngine === undefined) {
meta.logger.debug("Incorrect feature flags reported by scraper: removing " + error.featureFlags.join(","), { error, existingFlags: meta.featureFlags });
meta.featureFlags = new Set([...meta.featureFlags].filter(x => !error.featureFlags.includes(x)));
} else {
throw error;
}
Expand Down

0 comments on commit d276a23

Please sign in to comment.