Skip to content

Commit

Permalink
passing ruff
Browse files Browse the repository at this point in the history
  • Loading branch information
braceal committed Sep 13, 2024
1 parent 57d467b commit 8234c18
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 62 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ We provide example configurations for each parser in these files:
- **nougat**: [examples/nougat/nougat_test.yaml](examples/nougat/nougat_test.yaml)
- **oreo**: [examples/oreo/oreo_test.yaml](examples/oreo/oreo_test.yaml)
- **pymupdf**: [examples/pymupdf/pymupdf_test.yaml](examples/pymupdf/pymupdf_test.yaml)
**Note**: Please see the comments in the example YAML files for **documentation
on the settings**.
Expand Down Expand Up @@ -241,7 +241,7 @@ conda create -n pymupdf-wf python=3.10 -y
conda activate pymupdf-wf
pip install -r requirements/pymupdf_requirements.txt
```
to create a conda environment that serves the PDF extraction tools `PyMuPDF` and `pypdf`.
to create a conda environment that serves the PDF extraction tools `PyMuPDF` and `pypdf`.
Both tools are lightweight and operational from the same conda environment.

## `pypdf` Pipeline Installation
Expand Down
13 changes: 10 additions & 3 deletions pdfwf/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,18 @@
from pdfwf.parsers.pymupdf import PyMuPDFParserConfig
from pdfwf.parsers.pypdf import PyPDFParser
from pdfwf.parsers.pypdf import PyPDFParserConfig

from pdfwf.registry import registry

ParserConfigTypes = MarkerParserConfig | OreoParserConfig | NougatParserConfig | PyMuPDFParserConfig | PyPDFParserConfig
ParserTypes = MarkerParser| NougatParser | OreoParser | PyMuPDFParser | PyPDFParser
ParserConfigTypes = (
MarkerParserConfig
| OreoParserConfig
| NougatParserConfig
| PyMuPDFParserConfig
| PyPDFParserConfig
)
ParserTypes = (
MarkerParser | NougatParser | OreoParser | PyMuPDFParser | PyPDFParser
)

_ParserTypes = tuple[type[ParserConfigTypes], type[ParserTypes]]

Expand Down
61 changes: 30 additions & 31 deletions pdfwf/parsers/pymupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@

from __future__ import annotations

import fitz
import re

from typing import Any
from typing import Literal

import fitz

from pdfwf.parsers.base import BaseParser
from pdfwf.parsers.base import BaseParserConfig
from pdfwf.utils import exception_handler


__all__ = [
'PyMuPDFParser',
'PyMuPDFParserConfig',
Expand All @@ -36,57 +35,58 @@ def __init__(self, config: PyMuPDFParserConfig) -> None:
"""Initialize the marker parser."""
self.config = config
self.abstract_threshold = 580

def extract_doi_info(self, input_str:str) -> str:
"""
Extracts doi from PyMUPDF metadata entry (if present)
"""
match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)

def extract_doi_info(self, input_str: str) -> str:
"""Extract doi from PyMUPDF metadata entry (if present)."""
match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
if match:
return match.group(2)
else:
return ''

def convert_single_pdf(self, pdf_path) -> str:
"""Wraps PyMuPDF functionality"""
def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
"""Wrap PyMuPDF functionality."""
# open pdf
doc = fitz.open(pdf_path)

# scrape text
text_list = []
for page in doc:
text_list.append(page.get_text())
full_text = "\n".join(text_list)
full_text = '\n'.join(text_list)

# get first page (as a proxy for `abstract`)
first_page_text = text_list[0] if len(text_list) > 0 else ''

# metadata (available to PyMuPDF)
title = doc.metadata.get('title', '')
authors = doc.metadata.get('author', '')
createdate = doc.metadata.get('creationDate', '')
keywords = doc.metadata.get('keywords', '')
doi = self.extract_doi_info(doc.metadata.get('subject', ''))
doi = self.extract_doi_info(doc.metadata.get('subject', ''))
prod = doc.metadata.get('producer', '')
form = doc.metadata.get('format', '')
abstract = doc.metadata.get('subject', '') if len(doc.metadata.get('subject', '')) > self.abstract_threshold else ''
abstract = (
doc.metadata.get('subject', '')
if len(doc.metadata.get('subject', '')) > self.abstract_threshold
else ''
)

# - assemble
out_meta = {'title' : title,
'authors' : authors,
'creationdate' : createdate,
'keywords' : keywords,
'doi' : doi,
'producer' : prod,
'format' : form,
'first_page' : first_page_text,
'abstract' : abstract,
out_meta = {
'title': title,
'authors': authors,
'creationdate': createdate,
'keywords': keywords,
'doi': doi,
'producer': prod,
'format': form,
'first_page': first_page_text,
'abstract': abstract,
}

# full text & metadata entries
output = full_text, out_meta

return output

# full text & metadata entries
return full_text, out_meta

@exception_handler(default_return=None)
def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
Expand All @@ -103,7 +103,6 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
A tuple containing the full text of the PDF and the metadata
extracted from the PDF. If parsing fails, return None.
"""

full_text, out_meta = self.convert_single_pdf(pdf_path)

return full_text, out_meta
Expand Down
63 changes: 37 additions & 26 deletions pdfwf/parsers/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@

from __future__ import annotations

from pypdf import PdfReader
import re
import logging

import re
from typing import Any
from typing import Literal

from pypdf import PdfReader

from pdfwf.parsers.base import BaseParser
from pdfwf.parsers.base import BaseParserConfig
from pdfwf.utils import exception_handler


__all__ = [
'PyPDFParser',
'PyPDFParserConfig',
Expand Down Expand Up @@ -41,49 +40,62 @@ def __init__(self, config: PyPDFParserConfig) -> None:
# pypdf is verbose
logging.getLogger().setLevel(logging.ERROR)

def extract_doi_info(self, input_str:str) -> str:
"""
Extracts doi from pypdf metadata entry (if present)
"""
def extract_doi_info(self, input_str: str) -> str:
"""Extract doi from pypdf metadata entry (if present)."""
match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
if match:
return match.group(2)
else:
return ''

def convert_single_pdf(self, pdf_path) -> str:
"""Wraps pypdf functionality"""
def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
"""Wrap pypdf functionality."""
# open
reader = PdfReader(pdf_path)

# scrape text
full_text=''
full_text = ''
for page in reader.pages:
full_text += page.extract_text(extraction_mode="layout")
full_text += page.extract_text(extraction_mode='layout')

first_page_text = reader.pages[0].extract_text(extraction_mode="layout") if len(reader.pages[0]) > 0 else ''
first_page_text = (
reader.pages[0].extract_text(extraction_mode='layout')
if len(reader.pages[0]) > 0
else ''
)
meta = reader.metadata

# metadata (available to pypdf)
title = meta.get('/Title', '')
authors = meta.get('/Author', '')
createdate = meta.get('/CreationDate', '')
keywords = meta.get('/Keywords', '')
doi = meta.get('/doi', '') if meta.get('/doi', '')!='' else self.extract_doi_info(meta.get('/Subject', '')) # Use .get() to handle the missing DOI key
doi = (
meta.get('/doi', '')
if meta.get('/doi', '') != ''
else self.extract_doi_info(meta.get('/Subject', ''))
) # Use .get() to handle the missing DOI key
prod = meta.get('/Producer', '')
form = meta.get('/Format', '') # Not included for pypdf, so we set it directly
abstract = meta.get('/Subject', '') if len(meta.get('/Subject', '')) > self.abstract_threshold else ''
form = meta.get(
'/Format', ''
) # Not included for pypdf, so we set it directly
abstract = (
meta.get('/Subject', '')
if len(meta.get('/Subject', '')) > self.abstract_threshold
else ''
)

# - assemble
out_meta = {'title' : title,
'authors' : authors,
'createdate' : createdate,
'keywords' : keywords,
'doi' : doi,
'producer' : prod,
'format' : form,
'first_page' : first_page_text,
'abstract' : abstract,
out_meta = {
'title': title,
'authors': authors,
'createdate': createdate,
'keywords': keywords,
'doi': doi,
'producer': prod,
'format': form,
'first_page': first_page_text,
'abstract': abstract,
}

# full text & metadata entries
Expand All @@ -106,7 +118,6 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
A tuple containing the full text of the PDF and the metadata
extracted from the PDF. If parsing fails, return None.
"""

full_text, out_meta = self.convert_single_pdf(pdf_path)

return full_text, out_meta
Expand Down

0 comments on commit 8234c18

Please sign in to comment.