diff --git a/pdfwf/parsers/pymupdf.py b/pdfwf/parsers/pymupdf.py index c0071c2..cf172e3 100644 --- a/pdfwf/parsers/pymupdf.py +++ b/pdfwf/parsers/pymupdf.py @@ -26,10 +26,7 @@ class PyMuPDFParserConfig(BaseParserConfig): class PyMuPDFParser(BaseParser): - """Warmstart interface for the PyMuPDF PDF parser. - - No warmsart eneded as PyMuPDF is a Python library using CPUs only - """ + """Interface for the PyMuPDF PDF parser.""" def __init__(self, config: PyMuPDFParserConfig) -> None: """Initialize the marker parser.""" @@ -39,26 +36,36 @@ def __init__(self, config: PyMuPDFParserConfig) -> None: def extract_doi_info(self, input_str: str) -> str: """Extract doi from PyMUPDF metadata entry (if present).""" match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str) - if match: - return match.group(2) - else: - return '' - - def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: - """Wrap PyMuPDF functionality.""" - # open pdf + return match.group(2) if match else '' + + @exception_handler(default_return=None) + def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None: + """Parse a PDF file. + + Parameters + ---------- + pdf_path : str + Path to the PDF file to convert. + + Returns + ------- + tuple[str, dict[str, str]] | None + A tuple containing the full text of the PDF and the metadata + extracted from the PDF. If parsing fails, return None. + """ + # Open pdf doc = fitz.open(pdf_path) - # scrape text + # Scrape text text_list = [] for page in doc: text_list.append(page.get_text()) full_text = '\n'.join(text_list) - # get first page (asa proxy for `abstract`) + # Get first page (as a proxy for `abstract`) first_page_text = text_list[0] if len(text_list) > 0 else '' - # metadata (available to PyMuPDF) + # Metadata (available to PyMuPDF) title = doc.metadata.get('title', '') authors = doc.metadata.get('author', '') createdate = doc.metadata.get('creationDate', '') @@ -72,7 +79,7 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: else '' ) - # - assemble + # Assemble the metadata out_meta = { 'title': title, 'authors': authors, @@ -85,26 +92,10 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: 'abstract': abstract, } - # full text & metadata entries - return full_text, out_meta - - @exception_handler(default_return=None) - def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None: - """Parse a PDF file and extract markdown. - - Parameters - ---------- - pdf_path : str - Path to the PDF file to convert. - - Returns - ------- - tuple[str, dict[str, str]] | None - A tuple containing the full text of the PDF and the metadata - extracted from the PDF. If parsing fails, return None. - """ - full_text, out_meta = self.convert_single_pdf(pdf_path) + # TODO: Should we close the document? + # doc.close() + # full text & metadata entries return full_text, out_meta @exception_handler(default_return=None) diff --git a/pdfwf/parsers/pypdf.py b/pdfwf/parsers/pypdf.py index 03f6ec9..266f043 100644 --- a/pdfwf/parsers/pypdf.py +++ b/pdfwf/parsers/pypdf.py @@ -43,17 +43,28 @@ def __init__(self, config: PyPDFParserConfig) -> None: def extract_doi_info(self, input_str: str) -> str: """Extract doi from pypdf metadata entry (if present).""" match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str) - if match: - return match.group(2) - else: - return '' - - def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: - """Wrap pypdf functionality.""" - # open + return match.group(2) if match else '' + + @exception_handler(default_return=None) + def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None: + """Parse a PDF file. + + Parameters + ---------- + pdf_path : str + Path to the PDF file to convert. + + Returns + ------- + tuple[str, dict[str, str]] | None + A tuple containing the full text of the PDF and the metadata + extracted from the PDF. If parsing fails, return None. + """ + # TODO: This needs to be closed + # Open reader = PdfReader(pdf_path) - # scrape text + # Scrape text full_text = '' for page in reader.pages: full_text += page.extract_text(extraction_mode='layout') @@ -65,27 +76,27 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: ) meta = reader.metadata - # metadata (available to pypdf) + # Metadata (available to pypdf) title = meta.get('/Title', '') authors = meta.get('/Author', '') createdate = meta.get('/CreationDate', '') keywords = meta.get('/Keywords', '') + # Use .get() to handle the missing DOI key doi = ( meta.get('/doi', '') if meta.get('/doi', '') != '' else self.extract_doi_info(meta.get('/Subject', '')) - ) # Use .get() to handle the missing DOI key + ) prod = meta.get('/Producer', '') - form = meta.get( - '/Format', '' - ) # Not included for pypdf, so we set it directly + # Not included for pypdf, so we set it directly + form = meta.get('/Format', '') abstract = ( meta.get('/Subject', '') if len(meta.get('/Subject', '')) > self.abstract_threshold else '' ) - # - assemble + # Assemble the metadata out_meta = { 'title': title, 'authors': authors, @@ -98,28 +109,6 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]: 'abstract': abstract, } - # full text & metadata entries - output = full_text, out_meta - - return output - - @exception_handler(default_return=None) - def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None: - """Parse a PDF file and extract markdown. - - Parameters - ---------- - pdf_path : str - Path to the PDF file to convert. - - Returns - ------- - tuple[str, dict[str, str]] | None - A tuple containing the full text of the PDF and the metadata - extracted from the PDF. If parsing fails, return None. - """ - full_text, out_meta = self.convert_single_pdf(pdf_path) - return full_text, out_meta @exception_handler(default_return=None)