diff --git a/README.md b/README.md index cba857a..3d56f41 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ We provide example configurations for each parser in these files: - **nougat**: [examples/nougat/nougat_test.yaml](examples/nougat/nougat_test.yaml) - **oreo**: [examples/oreo/oreo_test.yaml](examples/oreo/oreo_test.yaml) - **pymupdf**: [examples/pymupdf/pymupdf_test.yaml](examples/pymupdf/pymupdf_test.yaml) - + **Note**: Please see the comments in the example YAML files for **documentation on the settings**. @@ -241,7 +241,7 @@ conda create -n pymupdf-wf python=3.10 -y conda activate pymupdf-wf pip install -r requirements/pymupdf_requirements.txt ``` -to create a conda environment that serves the PDF extraction tools `PyMuPDF` and `pypdf`. +to create a conda environment that serves the PDF extraction tools `PyMuPDF` and `pypdf`. Both tools are lightweight and operational from the same conda environment. ## `pypdf` Pipeline Installation diff --git a/pdfwf/parsers/__init__.py b/pdfwf/parsers/__init__.py index 41f9f3f..b499bbd 100644 --- a/pdfwf/parsers/__init__.py +++ b/pdfwf/parsers/__init__.py @@ -16,11 +16,18 @@ from pdfwf.parsers.pymupdf import PyMuPDFParserConfig from pdfwf.parsers.pypdf import PyPDFParser from pdfwf.parsers.pypdf import PyPDFParserConfig - from pdfwf.registry import registry -ParserConfigTypes = MarkerParserConfig | OreoParserConfig | NougatParserConfig | PyMuPDFParserConfig | PyPDFParserConfig -ParserTypes = MarkerParser| NougatParser | OreoParser | PyMuPDFParser | PyPDFParser +ParserConfigTypes = ( + MarkerParserConfig + | OreoParserConfig + | NougatParserConfig + | PyMuPDFParserConfig + | PyPDFParserConfig +) +ParserTypes = ( + MarkerParser | NougatParser | OreoParser | PyMuPDFParser | PyPDFParser +) _ParserTypes = tuple[type[ParserConfigTypes], type[ParserTypes]] diff --git a/pdfwf/parsers/pymupdf.py b/pdfwf/parsers/pymupdf.py index 0987345..c0071c2 100644 --- a/pdfwf/parsers/pymupdf.py +++ b/pdfwf/parsers/pymupdf.py @@ -2,17 +2,16 @@ from __future__ import annotations -import fitz import re - from typing import Any from typing import Literal +import fitz + from pdfwf.parsers.base import BaseParser from pdfwf.parsers.base import BaseParserConfig from pdfwf.utils import exception_handler - __all__ = [ 'PyMuPDFParser', 'PyMuPDFParserConfig', @@ -36,19 +35,17 @@ def __init__(self, config: PyMuPDFParserConfig) -> None: """Initialize the marker parser.""" self.config = config self.abstract_threshold = 580 - - def extract_doi_info(self, input_str:str) -> str: - """ - Extracts doi from PyMUPDF metadata entry (if present) - """ - match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str) + + def extract_doi_info(self, input_str: str) -> str: + """Extract doi from PyMUPDF metadata entry (if present).""" + match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str) if match: return match.group(2) else: return '' - def convert_single_pdf(self, pdf_path) -> str: - """Wraps PyMuPDF functionality""" + def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: + """Wrap PyMuPDF functionality.""" # open pdf doc = fitz.open(pdf_path) @@ -56,37 +53,40 @@ def convert_single_pdf(self, pdf_path) -> str: text_list = [] for page in doc: text_list.append(page.get_text()) - full_text = "\n".join(text_list) - + full_text = '\n'.join(text_list) + # get first page (asa proxy for `abstract`) first_page_text = text_list[0] if len(text_list) > 0 else '' - + # metadata (available to PyMuPDF) title = doc.metadata.get('title', '') authors = doc.metadata.get('author', '') createdate = doc.metadata.get('creationDate', '') keywords = doc.metadata.get('keywords', '') - doi = self.extract_doi_info(doc.metadata.get('subject', '')) + doi = self.extract_doi_info(doc.metadata.get('subject', '')) prod = doc.metadata.get('producer', '') form = doc.metadata.get('format', '') - abstract = doc.metadata.get('subject', '') if len(doc.metadata.get('subject', '')) > self.abstract_threshold else '' + abstract = ( + doc.metadata.get('subject', '') + if len(doc.metadata.get('subject', '')) > self.abstract_threshold + else '' + ) # - assemble - out_meta = {'title' : title, - 'authors' : authors, - 'creationdate' : createdate, - 'keywords' : keywords, - 'doi' : doi, - 'producer' : prod, - 'format' : form, - 'first_page' : first_page_text, - 'abstract' : abstract, + out_meta = { + 'title': title, + 'authors': authors, + 'creationdate': createdate, + 'keywords': keywords, + 'doi': doi, + 'producer': prod, + 'format': form, + 'first_page': first_page_text, + 'abstract': abstract, } - - # full text & metadata entries - output = full_text, out_meta - - return output + + # full text & metadata entries + return full_text, out_meta @exception_handler(default_return=None) def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None: @@ -103,7 +103,6 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None: A tuple containing the full text of the PDF and the metadata extracted from the PDF. If parsing fails, return None. """ - full_text, out_meta = self.convert_single_pdf(pdf_path) return full_text, out_meta diff --git a/pdfwf/parsers/pypdf.py b/pdfwf/parsers/pypdf.py index 9a0e76b..03f6ec9 100644 --- a/pdfwf/parsers/pypdf.py +++ b/pdfwf/parsers/pypdf.py @@ -2,18 +2,17 @@ from __future__ import annotations -from pypdf import PdfReader -import re import logging - +import re from typing import Any from typing import Literal +from pypdf import PdfReader + from pdfwf.parsers.base import BaseParser from pdfwf.parsers.base import BaseParserConfig from pdfwf.utils import exception_handler - __all__ = [ 'PyPDFParser', 'PyPDFParserConfig', @@ -41,27 +40,29 @@ def __init__(self, config: PyPDFParserConfig) -> None: # pypdf is verbose logging.getLogger().setLevel(logging.ERROR) - def extract_doi_info(self, input_str:str) -> str: - """ - Extracts doi from pypdf metadata entry (if present) - """ + def extract_doi_info(self, input_str: str) -> str: + """Extract doi from pypdf metadata entry (if present).""" match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str) if match: return match.group(2) else: return '' - def convert_single_pdf(self, pdf_path) -> str: - """Wraps pypdf functionality""" + def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: + """Wrap pypdf functionality.""" # open reader = PdfReader(pdf_path) # scrape text - full_text='' + full_text = '' for page in reader.pages: - full_text += page.extract_text(extraction_mode="layout") + full_text += page.extract_text(extraction_mode='layout') - first_page_text = reader.pages[0].extract_text(extraction_mode="layout") if len(reader.pages[0]) > 0 else '' + first_page_text = ( + reader.pages[0].extract_text(extraction_mode='layout') + if len(reader.pages[0]) > 0 + else '' + ) meta = reader.metadata # metadata (available to pypdf) @@ -69,21 +70,32 @@ def convert_single_pdf(self, pdf_path) -> str: authors = meta.get('/Author', '') createdate = meta.get('/CreationDate', '') keywords = meta.get('/Keywords', '') - doi = meta.get('/doi', '') if meta.get('/doi', '')!='' else self.extract_doi_info(meta.get('/Subject', '')) # Use .get() to handle the missing DOI key + doi = ( + meta.get('/doi', '') + if meta.get('/doi', '') != '' + else self.extract_doi_info(meta.get('/Subject', '')) + ) # Use .get() to handle the missing DOI key prod = meta.get('/Producer', '') - form = meta.get('/Format', '') # Not included for pypdf, so we set it directly - abstract = meta.get('/Subject', '') if len(meta.get('/Subject', '')) > self.abstract_threshold else '' + form = meta.get( + '/Format', '' + ) # Not included for pypdf, so we set it directly + abstract = ( + meta.get('/Subject', '') + if len(meta.get('/Subject', '')) > self.abstract_threshold + else '' + ) # - assemble - out_meta = {'title' : title, - 'authors' : authors, - 'createdate' : createdate, - 'keywords' : keywords, - 'doi' : doi, - 'producer' : prod, - 'format' : form, - 'first_page' : first_page_text, - 'abstract' : abstract, + out_meta = { + 'title': title, + 'authors': authors, + 'createdate': createdate, + 'keywords': keywords, + 'doi': doi, + 'producer': prod, + 'format': form, + 'first_page': first_page_text, + 'abstract': abstract, } # full text & metadata entries @@ -106,7 +118,6 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None: A tuple containing the full text of the PDF and the metadata extracted from the PDF. If parsing fails, return None. """ - full_text, out_meta = self.convert_single_pdf(pdf_path) return full_text, out_meta