Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
braceal committed Sep 13, 2024
1 parent 8234c18 commit aa7cf20
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 72 deletions.
61 changes: 26 additions & 35 deletions pdfwf/parsers/pymupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,7 @@ class PyMuPDFParserConfig(BaseParserConfig):


class PyMuPDFParser(BaseParser):
"""Warmstart interface for the PyMuPDF PDF parser.
No warmstart needed as PyMuPDF is a Python library using CPUs only
"""
"""Interface for the PyMuPDF PDF parser."""

def __init__(self, config: PyMuPDFParserConfig) -> None:
"""Initialize the marker parser."""
Expand All @@ -39,26 +36,36 @@ def __init__(self, config: PyMuPDFParserConfig) -> None:
def extract_doi_info(self, input_str: str) -> str:
"""Extract doi from PyMUPDF metadata entry (if present)."""
match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
if match:
return match.group(2)
else:
return ''

def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
"""Wrap PyMuPDF functionality."""
# open pdf
return match.group(2) if match else ''

@exception_handler(default_return=None)
def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
"""Parse a PDF file.
Parameters
----------
pdf_path : str
Path to the PDF file to convert.
Returns
-------
tuple[str, dict[str, str]] | None
A tuple containing the full text of the PDF and the metadata
extracted from the PDF. If parsing fails, return None.
"""
# Open pdf
doc = fitz.open(pdf_path)

# scrape text
# Scrape text
text_list = []
for page in doc:
text_list.append(page.get_text())
full_text = '\n'.join(text_list)

# get first page (as a proxy for `abstract`)
# Get first page (as a proxy for `abstract`)
first_page_text = text_list[0] if len(text_list) > 0 else ''

# metadata (available to PyMuPDF)
# Metadata (available to PyMuPDF)
title = doc.metadata.get('title', '')
authors = doc.metadata.get('author', '')
createdate = doc.metadata.get('creationDate', '')
Expand All @@ -72,7 +79,7 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
else ''
)

# - assemble
# Assemble the metadata
out_meta = {
'title': title,
'authors': authors,
Expand All @@ -85,26 +92,10 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
'abstract': abstract,
}

# full text & metadata entries
return full_text, out_meta

@exception_handler(default_return=None)
def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
"""Parse a PDF file and extract markdown.
Parameters
----------
pdf_path : str
Path to the PDF file to convert.
Returns
-------
tuple[str, dict[str, str]] | None
A tuple containing the full text of the PDF and the metadata
extracted from the PDF. If parsing fails, return None.
"""
full_text, out_meta = self.convert_single_pdf(pdf_path)
# TODO: Should we close the document?
# doc.close()

# full text & metadata entries
return full_text, out_meta

@exception_handler(default_return=None)
Expand Down
63 changes: 26 additions & 37 deletions pdfwf/parsers/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,28 @@ def __init__(self, config: PyPDFParserConfig) -> None:
def extract_doi_info(self, input_str: str) -> str:
"""Extract doi from pypdf metadata entry (if present)."""
match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
if match:
return match.group(2)
else:
return ''

def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
"""Wrap pypdf functionality."""
# open
return match.group(2) if match else ''

@exception_handler(default_return=None)
def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
"""Parse a PDF file.
Parameters
----------
pdf_path : str
Path to the PDF file to convert.
Returns
-------
tuple[str, dict[str, str]] | None
A tuple containing the full text of the PDF and the metadata
extracted from the PDF. If parsing fails, return None.
"""
# TODO: This needs to be closed
# Open
reader = PdfReader(pdf_path)

# scrape text
# Scrape text
full_text = ''
for page in reader.pages:
full_text += page.extract_text(extraction_mode='layout')
Expand All @@ -65,27 +76,27 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
)
meta = reader.metadata

# metadata (available to pypdf)
# Metadata (available to pypdf)
title = meta.get('/Title', '')
authors = meta.get('/Author', '')
createdate = meta.get('/CreationDate', '')
keywords = meta.get('/Keywords', '')
# Use .get() to handle the missing DOI key
doi = (
meta.get('/doi', '')
if meta.get('/doi', '') != ''
else self.extract_doi_info(meta.get('/Subject', ''))
) # Use .get() to handle the missing DOI key
)
prod = meta.get('/Producer', '')
form = meta.get(
'/Format', ''
) # Not included for pypdf, so we set it directly
# Not included for pypdf, so we set it directly
form = meta.get('/Format', '')
abstract = (
meta.get('/Subject', '')
if len(meta.get('/Subject', '')) > self.abstract_threshold
else ''
)

# - assemble
# Assemble the metadata
out_meta = {
'title': title,
'authors': authors,
Expand All @@ -98,28 +109,6 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
'abstract': abstract,
}

# full text & metadata entries
output = full_text, out_meta

return output

@exception_handler(default_return=None)
def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
"""Parse a PDF file and extract markdown.
Parameters
----------
pdf_path : str
Path to the PDF file to convert.
Returns
-------
tuple[str, dict[str, str]] | None
A tuple containing the full text of the PDF and the metadata
extracted from the PDF. If parsing fails, return None.
"""
full_text, out_meta = self.convert_single_pdf(pdf_path)

return full_text, out_meta

@exception_handler(default_return=None)
Expand Down

0 comments on commit aa7cf20

Please sign in to comment.