Skip to content

Commit

Permalink
fix(scrapeURL/pdf): handle if a presumed PDF link returns HTML (e.g. 404)

Browse files Browse the repository at this point in the history
  • Loading branch information
mogery committed Dec 10, 2024
1 parent d9e017e commit d276a23
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 22 deletions.
70 changes: 50 additions & 20 deletions apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError } from "../../error";

type PDFProcessorResult = {html: string, markdown?: string};

Expand Down Expand Up @@ -52,24 +53,47 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
const jobId = upload.id;

// TODO: timeout, retries
const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
method: "GET",
headers: {
"Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
},
logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
schema: z.object({
markdown: z.string(),
}),
tryCount: meta.options.timeout !== undefined ? 32 : 1200, // 5 minutes if timeout not specified
tryCooldown: 250,
});

return {
markdown: result.markdown,
html: await marked.parse(result.markdown, { async: true }),
};
const startedAt = Date.now();

while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
try {
const result = await robustFetch({
url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
method: "GET",
headers: {
"Authorization": `Bearer ${process.env.LLAMAPARSE_API_KEY}`,
},
logger: meta.logger.child({ method: "scrapePDFWithLlamaParse/result/robustFetch" }),
schema: z.object({
markdown: z.string(),
}),
});
return {
markdown: result.markdown,
html: await marked.parse(result.markdown, { async: true }),
};
} catch (e) {
if (e instanceof Error && e.message === "Request sent failure status") {
if ((e.cause as any).response.status === 404) {
// no-op, result not up yet
} else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) {
// URL is not a PDF, actually!
meta.logger.debug("URL is not actually a PDF, signalling...");
throw new RemoveFeatureError(["pdf"]);
} else {
throw new Error("LlamaParse threw an error", {
cause: e.cause,
});
}
} else {
throw e;
}
}

await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
}

throw new Error("LlamaParse timed out");
}

async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
Expand Down Expand Up @@ -107,8 +131,14 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
}, tempFilePath);
} catch (error) {
meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
Sentry.captureException(error);
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { error });
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
Sentry.captureException(error);
}
}
}

Expand Down
9 changes: 9 additions & 0 deletions apps/api/src/scraper/scrapeURL/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ export class AddFeatureError extends Error {
}
}

/**
 * Thrown by a scrape engine when it discovers mid-scrape that one or more
 * of the feature flags it was invoked with do not actually apply to the
 * target (e.g. a URL presumed to be a PDF that turns out to serve HTML).
 * The top-level scrape loop catches this, removes the listed flags from
 * the active flag set, and retries.
 */
export class RemoveFeatureError extends Error {
  /**
   * @param featureFlags - The flags that were incorrectly assumed and
   *   should be dropped before the scrape is retried.
   */
  constructor(public featureFlags: FeatureFlag[]) {
    super(`Incorrect feature flags have been discovered: ${featureFlags.join(", ")}`);
  }
}

export class SiteError extends Error {
public code: string;
constructor(code: string) {
Expand Down
7 changes: 5 additions & 2 deletions apps/api/src/scraper/scrapeURL/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { Document, ScrapeOptions } from "../../controllers/v1/types";
import { logger } from "../../lib/logger";
import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { AddFeatureError, EngineError, NoEnginesLeftError, SiteError, TimeoutError } from "./error";
import { AddFeatureError, EngineError, NoEnginesLeftError, RemoveFeatureError, SiteError, TimeoutError } from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
import { urlSpecificParams } from "./lib/urlSpecificParams";
Expand Down Expand Up @@ -216,7 +216,7 @@ async function scrapeURLLoop(
startedAt,
finishedAt: Date.now(),
};
} else if (error instanceof AddFeatureError) {
} else if (error instanceof AddFeatureError || error instanceof RemoveFeatureError) {
throw error;
} else if (error instanceof LLMRefusalError) {
results[engine] = {
Expand Down Expand Up @@ -293,6 +293,9 @@ export async function scrapeURL(
if (error instanceof AddFeatureError && meta.internalOptions.forceEngine === undefined) {
meta.logger.debug("More feature flags requested by scraper: adding " + error.featureFlags.join(", "), { error, existingFlags: meta.featureFlags });
meta.featureFlags = new Set([...meta.featureFlags].concat(error.featureFlags));
} else if (error instanceof RemoveFeatureError && meta.internalOptions.forceEngine === undefined) {
meta.logger.debug("Incorrect feature flags reported by scraper: removing " + error.featureFlags.join(","), { error, existingFlags: meta.featureFlags });
meta.featureFlags = new Set([...meta.featureFlags].filter(x => !error.featureFlags.includes(x)));
} else {
throw error;
}
Expand Down

0 comments on commit d276a23

Please sign in to comment.