From ed7d15d2af23409e351df5a3574ef565f06a29d6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 09:50:29 -0500 Subject: [PATCH 1/3] Update index.ts --- .../scraper/scrapeURL/engines/pdf/index.ts | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 24d5f002a..38dc6b5ee 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom const { response, tempFilePath } = await downloadFile(meta.id, meta.url); let result: PDFProcessorResult | null = null; - if (process.env.LLAMAPARSE_API_KEY) { + + // First, try parsing with PdfParse + result = await scrapePDFWithParsePDF( + { + ...meta, + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithParsePDF", + }), + }, + tempFilePath, + ); + + + // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse + if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) { try { - result = await scrapePDFWithLlamaParse( + const llamaResult = await scrapePDFWithLlamaParse( { ...meta, logger: meta.logger.child({ @@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom tempFilePath, timeToRun, ); + result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- falling back to parse-pdf", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom } } - if (result === null) { - result = await scrapePDFWithParsePDF( - { - ...meta, - logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithParsePDF", - }), - }, - tempFilePath, - ); - } - await fs.unlink(tempFilePath); return { From 1402831a0aef09b5a74fed5711efd4627af34e2f Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 09:59:52 -0500 Subject: [PATCH 2/3] Replace pdf parse with pdf to md --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 288 ++++++++++++++++++ .../scraper/scrapeURL/engines/pdf/index.ts | 15 +- 3 files changed, 297 insertions(+), 7 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index c4e70901d..f4ac48ffc 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -58,6 +58,7 @@ "@devil7softwares/pos": "^1.0.2", "@dqbd/tiktoken": "^1.0.17", "@nangohq/node": "^0.40.8", + "@opendocsg/pdf2md": "^0.2.1", "@sentry/cli": "^2.33.1", "@sentry/node": "^8.26.0", "@sentry/profiling-node": "^8.26.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 6d9717087..68d436550 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -29,6 +29,9 @@ importers: '@nangohq/node': specifier: ^0.40.8 version: 0.40.8 + '@opendocsg/pdf2md': + specifier: ^0.2.1 + version: 0.2.1 '@sentry/cli': specifier: ^2.33.1 version: 2.33.1 @@ -794,6 +797,10 @@ packages: resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==} engines: {node: '>=18'} + '@mapbox/node-pre-gyp@1.0.11': + resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} + hasBin: true + '@mixmark-io/domino@2.2.0': resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} @@ -837,6 +844,10 @@ packages: '@one-ini/wasm@0.1.1': resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==} + '@opendocsg/pdf2md@0.2.1': + resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==} + hasBin: true + '@opentelemetry/api-logs@0.52.1': resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==} engines: {node: '>=14'} @@ -1615,6 +1626,9 @@ packages: resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} engines: {node: '>=10.0.0'} + abbrev@1.1.1: + resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} + abbrev@2.0.0: resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -1708,6 +1722,14 @@ packages: resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==} engines: {node: '>=0.2.6'} + aproba@2.0.0: + resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==} + + are-we-there-yet@2.0.0: + resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. + arg@4.1.3: resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==} @@ -1908,6 +1930,10 @@ packages: caniuse-lite@1.0.30001627: resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==} + canvas@2.11.2: + resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} + engines: {node: '>=6'} + chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -1938,6 +1964,10 @@ packages: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} + chownr@2.0.0: + resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} + engines: {node: '>=10'} + chownr@3.0.0: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} @@ -1995,6 +2025,10 @@ packages: color-string@1.9.1: resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==} + color-support@1.1.3: + resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==} + hasBin: true + color@3.2.1: resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==} @@ -2021,6 +2055,9 @@ packages: config-chain@1.1.13: resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==} + console-control-strings@1.1.0: + resolution: {integrity: sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==} + content-disposition@0.5.4: resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==} engines: {node: '>= 0.6'} @@ -2147,6 +2184,10 @@ packages: resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==} engines: {node: '>=10'} + decompress-response@4.2.1: + resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==} + engines: {node: '>=8'} + dedent@1.5.3: resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==} peerDependencies: @@ -2171,6 +2212,9 @@ packages: resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==} engines: {node: '>=0.4.0'} + delegates@1.0.0: + resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==} + denque@2.1.0: resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==} engines: {node: '>=0.10'} @@ -2283,6 +2327,9 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} + enumify@1.0.4: + resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==} + env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -2484,6 +2531,10 @@ packages: resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==} engines: {node: '>=14.14'} + fs-minipass@2.1.0: + resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==} + engines: {node: '>= 8'} + fs.realpath@1.0.0: resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==} @@ -2495,6 +2546,11 @@ packages: function-bind@1.1.2: resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==} + gauge@3.0.2: + resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. + generic-pool@3.9.0: resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==} engines: {node: '>= 4'} @@ -2578,6 +2634,9 @@ packages: resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==} engines: {node: '>= 0.4'} + has-unicode@2.0.1: + resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==} + hasown@2.0.2: resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==} engines: {node: '>= 0.4'} @@ -3256,6 +3315,10 @@ packages: resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==} engines: {node: '>=12'} + make-dir@3.1.0: + resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} + engines: {node: '>=8'} + make-dir@4.0.0: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} @@ -3326,6 +3389,10 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} + mimic-response@2.1.0: + resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==} + engines: {node: '>=8'} + minimatch@3.1.2: resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==} @@ -3344,10 +3411,22 @@ packages: minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} + minipass@3.3.6: + resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==} + engines: {node: '>=8'} + + minipass@5.0.0: + resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==} + engines: {node: '>=8'} + minipass@7.1.2: resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==} engines: {node: '>=16 || 14 >=14.17'} + minizlib@2.1.2: + resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==} + engines: {node: '>= 8'} + minizlib@3.0.1: resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==} engines: {node: '>= 18'} @@ -3359,6 +3438,11 @@ packages: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} hasBin: true + mkdirp@1.0.4: + resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==} + engines: {node: '>=10'} + hasBin: true + mkdirp@3.0.1: resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==} engines: {node: '>=10'} @@ -3447,6 +3531,9 @@ packages: resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==} hasBin: true + nan@2.22.0: + resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==} + natural-compare@1.4.0: resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==} @@ -3507,6 +3594,11 @@ packages: engines: {node: '>=8.10.0'} hasBin: true + nopt@5.0.0: + resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} + engines: {node: '>=6'} + hasBin: true + nopt@7.2.1: resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -3524,6 +3616,10 @@ packages: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} + npmlog@5.0.1: + resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==} + deprecated: This package is no longer supported. + nth-check@2.1.1: resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} @@ -3949,6 +4045,11 @@ packages: resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==} engines: {node: '>= 4'} + rimraf@3.0.2: + resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==} + deprecated: Rimraf versions prior to v4 are no longer supported + hasBin: true + rimraf@5.0.7: resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==} engines: {node: '>=14.18'} @@ -4019,6 +4120,9 @@ packages: resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==} engines: {node: '>= 0.8.0'} + set-blocking@2.0.0: + resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==} + set-function-length@1.2.2: resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==} engines: {node: '>= 0.4'} @@ -4057,6 +4161,12 @@ packages: resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==} engines: {node: '>=14'} + simple-concat@1.0.1: + resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==} + + simple-get@3.1.1: + resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==} + simple-swizzle@0.2.2: resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==} @@ -4215,6 +4325,10 @@ packages: tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} + tar@6.2.1: + resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} + engines: {node: '>=10'} + tar@7.2.0: resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==} engines: {node: '>=18'} @@ -4369,6 +4483,9 @@ packages: resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} engines: {node: '>= 10.0.0'} + unpdf@0.12.1: + resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==} + unpipe@1.0.0: resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -4456,6 +4573,9 @@ packages: engines: {node: '>= 8'} hasBin: true + wide-align@1.1.5: + resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==} + winston-transport@4.8.0: resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==} engines: {node: '>= 12.0.0'} @@ -5607,6 +5727,22 @@ snapshots: - langchain - openai + '@mapbox/node-pre-gyp@1.0.11': + dependencies: + detect-libc: 2.0.3 + https-proxy-agent: 5.0.1 + make-dir: 3.1.0 + node-fetch: 2.7.0 + nopt: 5.0.0 + npmlog: 5.0.1 + rimraf: 3.0.2 + semver: 7.6.2 + tar: 6.2.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + '@mixmark-io/domino@2.2.0': {} '@mongodb-js/saslprep@1.1.7': @@ -5639,6 +5775,15 @@ snapshots: '@one-ini/wasm@0.1.1': {} + '@opendocsg/pdf2md@0.2.1': + dependencies: + enumify: 1.0.4 + minimist: 1.2.8 + unpdf: 0.12.1 + transitivePeerDependencies: + - encoding + - supports-color + '@opentelemetry/api-logs@0.52.1': dependencies: '@opentelemetry/api': 1.9.0 @@ -6673,6 +6818,9 @@ snapshots: '@xmldom/xmldom@0.8.10': {} + abbrev@1.1.1: + optional: true + abbrev@2.0.0: {} abort-controller@3.0.0: @@ -6755,6 +6903,15 @@ snapshots: dependencies: sylvester: 0.0.12 + aproba@2.0.0: + optional: true + + are-we-there-yet@2.0.0: + dependencies: + delegates: 1.0.0 + readable-stream: 3.6.2 + optional: true + arg@4.1.3: {} argparse@1.0.10: @@ -7008,6 +7165,16 @@ snapshots: caniuse-lite@1.0.30001627: {} + canvas@2.11.2: + dependencies: + '@mapbox/node-pre-gyp': 1.0.11 + nan: 2.22.0 + simple-get: 3.1.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + chalk@2.4.2: dependencies: ansi-styles: 3.2.1 @@ -7056,6 +7223,9 @@ snapshots: optionalDependencies: fsevents: 2.3.3 + chownr@2.0.0: + optional: true + chownr@3.0.0: {} chromium-bidi@0.5.24(devtools-protocol@0.0.1299070): @@ -7121,6 +7291,9 @@ snapshots: color-name: 1.1.4 simple-swizzle: 0.2.2 + color-support@1.1.3: + optional: true + color@3.2.1: dependencies: color-convert: 1.9.3 @@ -7148,6 +7321,9 @@ snapshots: ini: 1.3.8 proto-list: 1.2.4 + console-control-strings@1.1.0: + optional: true + content-disposition@0.5.4: dependencies: safe-buffer: 5.2.1 @@ -7255,6 +7431,11 @@ snapshots: decamelize@4.0.0: {} + decompress-response@4.2.1: + dependencies: + mimic-response: 2.1.0 + optional: true + dedent@1.5.3: {} deepmerge@4.3.1: {} @@ -7273,6 +7454,9 @@ snapshots: delayed-stream@1.0.0: {} + delegates@1.0.0: + optional: true + denque@2.1.0: {} depd@2.0.0: {} @@ -7364,6 +7548,8 @@ snapshots: entities@4.5.0: {} + enumify@1.0.4: {} + env-paths@2.2.1: {} error-ex@1.3.2: @@ -7589,6 +7775,11 @@ snapshots: jsonfile: 6.1.0 universalify: 2.0.1 + fs-minipass@2.1.0: + dependencies: + minipass: 3.3.6 + optional: true + fs.realpath@1.0.0: {} fsevents@2.3.3: @@ -7596,6 +7787,19 @@ snapshots: function-bind@1.1.2: {} + gauge@3.0.2: + dependencies: + aproba: 2.0.0 + color-support: 1.1.3 + console-control-strings: 1.1.0 + has-unicode: 2.0.1 + object-assign: 4.1.1 + signal-exit: 3.0.7 + string-width: 4.2.3 + strip-ansi: 6.0.1 + wide-align: 1.1.5 + optional: true + generic-pool@3.9.0: {} gensync@1.0.0-beta.2: {} @@ -7683,6 +7887,9 @@ snapshots: has-symbols@1.0.3: {} + has-unicode@2.0.1: + optional: true + hasown@2.0.2: dependencies: function-bind: 1.1.2 @@ -8463,6 +8670,11 @@ snapshots: luxon@3.4.4: {} + make-dir@3.1.0: + dependencies: + semver: 6.3.1 + optional: true + make-dir@4.0.0: dependencies: semver: 7.6.2 @@ -8523,6 +8735,9 @@ snapshots: mimic-fn@2.1.0: {} + mimic-response@2.1.0: + optional: true + minimatch@3.1.2: dependencies: brace-expansion: 1.1.11 @@ -8541,8 +8756,22 @@ snapshots: minimist@1.2.8: {} + minipass@3.3.6: + dependencies: + yallist: 4.0.0 + optional: true + + minipass@5.0.0: + optional: true + minipass@7.1.2: {} + minizlib@2.1.2: + dependencies: + minipass: 3.3.6 + yallist: 4.0.0 + optional: true + minizlib@3.0.1: dependencies: minipass: 7.1.2 @@ -8554,6 +8783,9 @@ snapshots: dependencies: minimist: 1.2.8 + mkdirp@1.0.4: + optional: true + mkdirp@3.0.1: {} ml-array-mean@1.1.6: @@ -8646,6 +8878,9 @@ snapshots: mustache@4.2.0: {} + nan@2.22.0: + optional: true + natural-compare@1.4.0: {} natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3): @@ -8726,6 +8961,11 @@ snapshots: touch: 3.1.1 undefsafe: 2.0.5 + nopt@5.0.0: + dependencies: + abbrev: 1.1.1 + optional: true + nopt@7.2.1: dependencies: abbrev: 2.0.0 @@ -8738,6 +8978,14 @@ snapshots: dependencies: path-key: 3.1.1 + npmlog@5.0.1: + dependencies: + are-we-there-yet: 2.0.0 + console-control-strings: 1.1.0 + gauge: 3.0.2 + set-blocking: 2.0.0 + optional: true + nth-check@2.1.1: dependencies: boolbase: 1.0.0 @@ -9210,6 +9458,11 @@ snapshots: retry@0.13.1: {} + rimraf@3.0.2: + dependencies: + glob: 7.2.3 + optional: true + rimraf@5.0.7: dependencies: glob: 10.4.2 @@ -9285,6 +9538,9 @@ snapshots: transitivePeerDependencies: - supports-color + set-blocking@2.0.0: + optional: true + set-function-length@1.2.2: dependencies: define-data-property: 1.1.4 @@ -9321,6 +9577,16 @@ snapshots: signal-exit@4.1.0: {} + simple-concat@1.0.1: + optional: true + + simple-get@3.1.1: + dependencies: + decompress-response: 4.2.1 + once: 1.4.0 + simple-concat: 1.0.1 + optional: true + simple-swizzle@0.2.2: dependencies: is-arrayish: 0.3.2 @@ -9494,6 +9760,16 @@ snapshots: fast-fifo: 1.3.2 streamx: 2.18.0 + tar@6.2.1: + dependencies: + chownr: 2.0.0 + fs-minipass: 2.1.0 + minipass: 5.0.0 + minizlib: 2.1.2 + mkdirp: 1.0.4 + yallist: 4.0.0 + optional: true + tar@7.2.0: dependencies: '@isaacs/fs-minipass': 4.0.1 @@ -9626,6 +9902,13 @@ snapshots: universalify@2.0.1: {} + unpdf@0.12.1: + optionalDependencies: + canvas: 2.11.2 + transitivePeerDependencies: + - encoding + - supports-color + unpipe@1.0.0: {} unstructured-client@0.11.3(zod@3.23.8): @@ -9698,6 +9981,11 @@ snapshots: dependencies: isexe: 2.0.0 + wide-align@1.1.5: + dependencies: + string-width: 4.2.3 + optional: true + winston-transport@4.8.0: dependencies: logform: 2.6.1 diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 38dc6b5ee..c3baa6944 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch"; import { z } from "zod"; import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; -import PdfParse from "pdf-parse"; +import pdf2md from "@opendocsg/pdf2md"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { RemoveFeatureError } from "../../error"; @@ -113,10 +113,11 @@ async function scrapePDFWithParsePDF( meta: Meta, tempFilePath: string, ): Promise { - meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); + meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath }); - const result = await PdfParse(await fs.readFile(tempFilePath)); - const escaped = escapeHtml(result.text); + const pdfBuffer = await fs.readFile(tempFilePath); + const markdown = await pdf2md(pdfBuffer); + const escaped = escapeHtml(markdown); return { markdown: escaped, @@ -141,7 +142,7 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom let result: PDFProcessorResult | null = null; - // First, try parsing with PdfParse + // First, try parsing with pdf2md result = await scrapePDFWithParsePDF( { ...meta, @@ -169,14 +170,14 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { + meta.logger.warn("LlamaParse timed out -- using pdf2md result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- using parse-pdf result", + "LlamaParse failed to parse PDF -- using pdf2md result", { error }, ); Sentry.captureException(error); From 194353af0db6f4734845940589ae233eb1a124f3 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 10:04:20 -0500 Subject: [PATCH 3/3] Remove pdf parse --- apps/api/package.json | 1 - apps/api/pnpm-lock.yaml | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index f4ac48ffc..1ccfee5e2 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -101,7 +101,6 @@ "mongoose": "^8.4.4", "natural": "^7.0.7", "openai": "^4.57.0", - "pdf-parse": "^1.1.1", "pos": "^0.4.2", "posthog-node": "^4.0.1", "promptable": "^0.0.10", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 68d436550..9014c42e7 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -158,9 +158,6 @@ importers: openai: specifier: ^4.57.0 version: 4.57.0(zod@3.23.8) - pdf-parse: - specifier: ^1.1.1 - version: 1.1.1 pos: specifier: ^0.4.2 version: 0.4.2 @@ -8927,7 +8924,8 @@ snapshots: node-domexception@1.0.0: {} - node-ensure@0.0.0: {} + node-ensure@0.0.0: + optional: true node-fetch@2.7.0: dependencies: @@ -9157,6 +9155,7 @@ snapshots: node-ensure: 0.0.0 transitivePeerDependencies: - supports-color + optional: true peberminta@0.9.0: {}