From 8e85ba75fff5ff3419acebd694c2b471af05bd6c Mon Sep 17 00:00:00 2001
From: linhkhanhhoang <93673250+linhkhanhhoang@users.noreply.github.com>
Date: Mon, 27 May 2024 21:31:43 +0700
Subject: [PATCH] pdf parsing file from s3

---
 server/package-lock.json       | 26 ++++++++++++++++++++++++++
 server/package.json            |  1 +
 server/src/utils/extractPDF.js | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 server/src/utils/extractPDF.js

diff --git a/server/package-lock.json b/server/package-lock.json
index a174001..47fb600 100644
--- a/server/package-lock.json
+++ b/server/package-lock.json
@@ -24,6 +24,7 @@
         "mongoose": "^8.2.3",
         "multer": "^1.4.5-lts.1",
         "nodemon": "^3.1.0",
+        "pdf-parse": "^1.1.1",
         "serverless": "^3.38.0",
         "serverless-http": "^3.2.0",
         "supertest": "^6.3.4",
@@ -10390,6 +10391,11 @@
         "node": ">= 0.10.5"
       }
     },
+    "node_modules/node-ensure": {
+      "version": "0.0.0",
+      "resolved": "https://registry.npmjs.org/node-ensure/-/node-ensure-0.0.0.tgz",
+      "integrity": "sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw=="
+    },
     "node_modules/node-fetch": {
       "version": "2.7.0",
       "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
@@ -10952,6 +10958,26 @@
       "resolved": "https://registry.npmjs.org/path2/-/path2-0.1.0.tgz",
       "integrity": "sha512-TX+cz8Jk+ta7IvRy2FAej8rdlbrP0+uBIkP/5DTODez/AuL/vSb30KuAdDxGVREXzn8QfAiu5mJYJ1XjbOhEPA=="
     },
+    "node_modules/pdf-parse": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.1.tgz",
+      "integrity": "sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==",
+      "dependencies": {
+        "debug": "^3.1.0",
+        "node-ensure": "^0.0.0"
+      },
+      "engines": {
+        "node": ">=6.8.1"
+      }
+    },
+    "node_modules/pdf-parse/node_modules/debug": {
+      "version": "3.2.7",
+      "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz",
+      "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==",
+      "dependencies": {
+        "ms": "^2.1.1"
+      }
+    },
     "node_modules/peek-readable": {
       "version": "4.1.0",
       "resolved": "https://registry.npmjs.org/peek-readable/-/peek-readable-4.1.0.tgz",
diff --git a/server/package.json b/server/package.json
index 8a0bdb6..31c315e 100644
--- a/server/package.json
+++ b/server/package.json
@@ -29,6 +29,7 @@
     "mongoose": "^8.2.3",
     "multer": "^1.4.5-lts.1",
     "nodemon": "^3.1.0",
+    "pdf-parse": "^1.1.1",
     "serverless": "^3.38.0",
     "serverless-http": "^3.2.0",
     "supertest": "^6.3.4",
diff --git a/server/src/utils/extractPDF.js b/server/src/utils/extractPDF.js
new file mode 100644
index 0000000..690a215
--- /dev/null
+++ b/server/src/utils/extractPDF.js
@@ -0,0 +1,51 @@
+// NOTE(review): when this file is loaded as native ESM, pdf-parse@1.1.1's
+// entry point may take its debug branch (it keys off `module.parent`) and try
+// to read a bundled test PDF at import time. If an ENOENT shows up on import,
+// switch to "pdf-parse/lib/pdf-parse.js" -- confirm against the runtime setup.
+import pdf from "pdf-parse";
+import axios from "axios";
+
+/**
+ * Download a PDF over HTTP(S) and return its raw bytes.
+ *
+ * @param {string} resume_url - URL of the PDF to fetch.
+ * @returns {Promise<Buffer|ArrayBuffer>} The response body exactly as axios
+ *   delivers it for `responseType: "arraybuffer"`.
+ * @throws {TypeError} If `resume_url` is null, undefined, or an empty string.
+ * @throws {Error} Whatever axios rejects with (network error, non-2xx status).
+ */
+export const getResumeFromS3 = async (resume_url) => {
+  // Fail fast with a clear message instead of letting axios reject with an
+  // opaque error for a missing URL.
+  if (resume_url == null || resume_url === "") {
+    throw new TypeError("resume_url must be a non-empty URL");
+  }
+
+  // Request binary data so the PDF bytes are not decoded as text/JSON.
+  const response = await axios.get(resume_url, { responseType: "arraybuffer" });
+  return response.data;
+};
+
+/**
+ * Fetch the PDF at the given URL and extract its text content.
+ *
+ * This is best-effort: download and parse failures are logged, not thrown.
+ *
+ * @param {string} filePath - URL of the PDF; it is handed to
+ *   `getResumeFromS3`, so despite the name it is not a local file path.
+ * @returns {Promise<string|undefined>} The extracted text, or `undefined`
+ *   when the download or the parse failed.
+ */
+const parsePdf = async (filePath) => {
+  try {
+    const dataBuffer = await getResumeFromS3(filePath);
+    const { text } = await pdf(dataBuffer);
+    return text;
+  } catch (error) {
+    console.error("Error parsing PDF:", error);
+    // Explicit so the "undefined on failure" contract is visible to callers.
+    return undefined;
+  }
+};
+
+export default parsePdf;