From 614ee24a7daafec37b31a8d663d18b68e7bf9805 Mon Sep 17 00:00:00 2001 From: mmz-001 <70096033+mmz-001@users.noreply.github.com> Date: Sat, 8 Jul 2023 12:11:11 +0530 Subject: [PATCH] use pymupdf for faster pdf parsing --- knowledge_gpt/core/parsing.py | 11 +++++--- poetry.lock | 49 +++++++++++++++++++++++++---------- pyproject.toml | 2 +- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/knowledge_gpt/core/parsing.py b/knowledge_gpt/core/parsing.py index c7101657..b16129be 100644 --- a/knowledge_gpt/core/parsing.py +++ b/knowledge_gpt/core/parsing.py @@ -4,7 +4,7 @@ import docx2txt from langchain.docstore.document import Document -from pypdf import PdfReader +import fitz from hashlib import md5 from abc import abstractmethod, ABC @@ -69,14 +69,17 @@ def from_bytes(cls, file: BytesIO) -> "DocxFile": class PdfFile(File): @classmethod def from_bytes(cls, file: BytesIO) -> "PdfFile": - pdf = PdfReader(file) + pdf = fitz.open(stream=file.read(), filetype="pdf") # type: ignore docs = [] - for i, page in enumerate(pdf.pages): - text = page.extract_text() + for i, page in enumerate(pdf): + text = page.get_text(sort=True) text = strip_consecutive_newlines(text) doc = Document(page_content=text.strip()) doc.metadata["page"] = i + 1 docs.append(doc) + # file.read() mutates the file object, which can affect caching + # so we need to reset the file pointer to the beginning + file.seek(0) return cls(name=file.name, id=md5(file.read()).hexdigest(), docs=docs) diff --git a/poetry.lock b/poetry.lock index 5dec5674..8a6894ac 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2069,23 +2069,44 @@ files = [ ] [[package]] -name = "pypdf" -version = "3.3.0" -description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +name = "pymupdf" +version = "1.22.5" +description = "Python bindings for the PDF toolkit and renderer MuPDF" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pypdf-3.3.0-py3-none-any.whl", hash = "sha256:4239eff2e3d5e5d90b88590e5924cb34e7743ec1418717749a16cc9032c37f77"}, - {file = "pypdf-3.3.0.tar.gz", hash = "sha256:37a272f6dd7d9a0eba29ae201d07d3060f8cb81d301e21f42d99e62d4c4e6e4b"}, + {file = "PyMuPDF-1.22.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:640b8e4cb116dd87a3c854e49808a4f63625e663a7bc5b1efc971db5b4775367"}, + {file = "PyMuPDF-1.22.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:17efbbf0e2d99d24cfc302fac512928eb294f10b7b67d597d04dafd012812e4e"}, + {file = "PyMuPDF-1.22.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9bc9b9bf0f2beea3911750d2d66247608be8cbad33b7a050cacec9e4c105a1ca"}, + {file = "PyMuPDF-1.22.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7734a32a91eea4b502b8f9d2915cdba0a372226e14fb983876d763110dcefef"}, + {file = "PyMuPDF-1.22.5-cp310-cp310-win32.whl", hash = "sha256:c2fd70ca9961f7871810dce1b7d0a42a69eb8ff2d786621123952bd505a6867e"}, + {file = "PyMuPDF-1.22.5-cp310-cp310-win_amd64.whl", hash = "sha256:add310c96df6933cfb4ce3821c9c7b5c133e8aa609a4c9416e1c7af546163488"}, + {file = "PyMuPDF-1.22.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:017aaba511526facfc928e9d95d2c10d28a2821b05b9039bf422031a7da8584e"}, + {file = "PyMuPDF-1.22.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6fe5e44a14864d921fb96669a82f9635846806176f77f1d73c61feb84ebf4d84"}, + {file = "PyMuPDF-1.22.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e74d766f79e41e10c51865233042ab2cc4612ca7942812dca0603f4d0f8f73d"}, + {file = "PyMuPDF-1.22.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe8175452fcc99a0af6429d8acd87682a3a70c5879d73532c7327f71ce508a35"}, + {file = "PyMuPDF-1.22.5-cp311-cp311-win32.whl", hash = "sha256:42f59f4999d7f8b35c850050bd965e98c081a7d9b92d5f9dcf30203b30d06876"}, + {file = "PyMuPDF-1.22.5-cp311-cp311-win_amd64.whl", hash = "sha256:3d71c47aa14b73f2df7d03be8c547a05df6c6898d8c63a0f752b26f206eefd3c"}, + {file = "PyMuPDF-1.22.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4bcad7ea4b3ab82c46fe8da27ec738d38c213ed9935ef67d98ed09574d9a234e"}, + {file = "PyMuPDF-1.22.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b04a83ddcb3f7c935c75a1f7f6050c85fe4062a2ea64c47ee6bda788d037761"}, + {file = "PyMuPDF-1.22.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d02ee28663077f15d529b04d27588b174fa937daf73a294df279bbf70c468f5c"}, + {file = "PyMuPDF-1.22.5-cp37-cp37m-win32.whl", hash = "sha256:411fc35f6dae16ec940b6b0406e84be6ff29f93b30908ea1427e2a4bd594d4ba"}, + {file = "PyMuPDF-1.22.5-cp37-cp37m-win_amd64.whl", hash = "sha256:7c8c0f686865e330de90b93d53b100f7f07c2f10f5449ceb721121f459f7cc4a"}, + {file = "PyMuPDF-1.22.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:64ae9f81b8fe0a3e6386a24887a92736793479c5918ecac3b7deac2d02abf1f2"}, + {file = "PyMuPDF-1.22.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7562436dadf8382e59ac3739fbbf9d5b2d807fafc7f28cb884863430e0de6505"}, + {file = "PyMuPDF-1.22.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c22046e5f2cf0d72f9809a967340db1b238fefe58322896bc7c3f3d1d10b42"}, + {file = "PyMuPDF-1.22.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efa601dc4116c17a6b09255b031b5a1891e3ac18b50ec536452a725a6b75db8d"}, + {file = "PyMuPDF-1.22.5-cp38-cp38-win32.whl", hash = "sha256:3d0fe749e648f5245059d5f771fb50c1a988a1d2e82268b56377b2176a9fee5d"}, + {file = "PyMuPDF-1.22.5-cp38-cp38-win_amd64.whl", hash = "sha256:4fbc5bfe6ecc53929e3fd0db9846fb7da084ddb4b1fc1063857245fa783974d9"}, + {file = "PyMuPDF-1.22.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:87b36e0797ab7fbb7ef594c7a6e0febc7ffb4101a42ea796726a8288391a3769"}, + {file = "PyMuPDF-1.22.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:01119edb7e4c3dd8c154d237b8ac927bd359eea8d31468f9a89aa308b5bca04e"}, + {file = "PyMuPDF-1.22.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fde02fcb387863873b56730f4b9f65515d87c92c12299f0f0a74b3ccdfe35062"}, + {file = "PyMuPDF-1.22.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30c55814bbf6461aef9b34cb524d1d14857d5ec6ccfbb78ecfb1d07dfc40eeb8"}, + {file = "PyMuPDF-1.22.5-cp39-cp39-win32.whl", hash = "sha256:0542178c3a399282903705a8cc298e7f33f4770605e0a9db344aff5d375bcf0b"}, + {file = "PyMuPDF-1.22.5-cp39-cp39-win_amd64.whl", hash = "sha256:f8ca46a6987e14f58ec8dfda2d2376bacd113c1fec5f58bebf90838bb4408ab9"}, + {file = "PyMuPDF-1.22.5.tar.gz", hash = "sha256:5ec8d5106752297529d0d68d46cfc4ce99914aabd99be843f1599a1842d63fe9"}, ] -[package.extras] -crypto = ["PyCryptodome"] -dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] -docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] -full = ["Pillow", "PyCryptodome"] -image = ["Pillow"] - [[package]] name = "pyrsistent" version = "0.19.3" @@ -3052,4 +3073,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "a5900883c31083d754c1865a3d8e3a42d9d322b82d2abbb8fa5cfaa81e5e6878" +content-hash = "c13766dab3db88be40f9ddb2e0b350d3c573938571457e484a52ff4847c9c963" diff --git a/pyproject.toml b/pyproject.toml index acbd33b4..a45acee1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,10 +16,10 @@ faiss-cpu = "^1.7.3" openai = "^0.27.8" docx2txt = "^0.8" pillow = "^9.4.0" -pypdf = "^3.3.0" tenacity = "^8.2.0" tiktoken = "^0.4.0" pycryptodome = "^3.18.0" +pymupdf = "^1.22.5" [tool.poetry.group.dev.dependencies]