diff --git a/README.md b/README.md
index 0bbfeb5..f8d7a65 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ to run on other HPC systems by adding an appropriate
 - [Install Marker](#marker-pipeline-installation)
 - [Install Nougat](#nougat-pipeline-installation)
 - [Install Oreo](#oreo-pipeline-installation)
+- [Install PyMuPDF](#pymupdf-pipeline-installation)

 **Note**: You may need different virtual environments for each parser.

@@ -74,7 +75,7 @@ compute_settings:
   # The number of compute nodes to use
   num_nodes: 1
   # Make sure to update the path to your conda environment and HF cache
-  worker_init: "module load conda/2023-10-04; conda activate marker-wf; export HF_HOME="
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME="
   # The scheduler options to use when submitting jobs
   scheduler_options: "#PBS -l filesystems=home:eagle:grand"
   # Make sure to change the account to the account you want to charge
@@ -89,6 +90,7 @@ We provide example configurations for each parser in these files:
 - **marker**: [examples/marker/marker_test.yaml](examples/marker/marker_test.yaml)
 - **nougat**: [examples/nougat/nougat_test.yaml](examples/nougat/nougat_test.yaml)
 - **oreo**: [examples/oreo/oreo_test.yaml](examples/oreo/oreo_test.yaml)
+- **pymupdf**: [examples/pymupdf/pymupdf_test.yaml](examples/pymupdf/pymupdf_test.yaml)

 **Note**: Please see the comments in the example YAML files for **documentation on the settings**.

@@ -175,7 +177,7 @@ pip install pypdf
 On a compute node, run:
 ```bash
 # Create a conda environment with python3.10
-module load conda/2023-10-04
+module use /soft/modulefiles; module load conda/2024-04-29
 conda create -n nougat-wf python=3.10
 conda activate nougat-wf
@@ -203,7 +205,7 @@ If `Nougat` inference ran successfully, proceed to install the `pdfwf` workflow
 ## `Oreo` Pipeline Installation
 On a compute node, run:
 ```bash
-module load conda/2023-10-04
+module use /soft/modulefiles; module load conda/2024-04-29
 conda create -n pdfwf python=3.10 -y
 conda activate pdfwf
 mamba install pytorch torchvision pytorch-cuda=11.8 -c pytorch -c nvidia -y
@@ -213,6 +215,20 @@ git clone git@github.com:ultralytics/yolov5.git
 Then set the `yolov5_path` option to the path of the cloned `yolov5` repository.

+## `PyMuPDF` Pipeline Installation
+On any node, run:
+```bash
+module use /soft/modulefiles; module load conda/2024-04-29
+conda create -n pymupdf-wf python=3.10 -y
+conda activate pymupdf-wf
+pip install -r requirements/pymupdf_requirements.txt
+```
+to create a conda environment that provides both PDF extraction tools, `PyMuPDF` and `pypdf`.
+Both tools are lightweight and run from the same conda environment.
+
+## `pypdf` Pipeline Installation
+`pypdf` requires no separate setup: it is lightweight and runs in the same `pymupdf-wf` conda environment as `PyMuPDF`.
+
 ## CLI
 For running smaller jobs without using the Parsl workflow, the CLI can be used. The CLI provides different commands for running the various parsers.
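For a quick sanity check of the shared environment before submitting any jobs, the snippet below can be run inside `pymupdf-wf`. This is a minimal sketch, assuming only that the install above succeeded; note that PyMuPDF installs under the import name `fitz`:

```python
# Run inside the pymupdf-wf environment to confirm both libraries resolve.
import fitz   # PyMuPDF's import name
import pypdf

# The printed versions should match requirements/pymupdf_requirements.txt
print(fitz.__doc__)        # e.g. 'PyMuPDF 1.24.9: Python bindings for ...'
print(pypdf.__version__)   # e.g. '4.3.1'
```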
diff --git a/examples/marker/marker_joint.yaml b/examples/marker/marker_joint.yaml
new file mode 100644
index 0000000..3941d34
--- /dev/null
+++ b/examples/marker/marker_joint.yaml
@@ -0,0 +1,30 @@
+# The directory containing the pdfs to convert
+pdf_dir: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_marker
+
+# Chunk size for pdf parsing
+chunk_size: 100
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: marker
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME=/eagle/projects/argonne_tpc/siebenschuh/HF_cache"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/marker/marker_small_test.yaml b/examples/marker/marker_small_test.yaml
new file mode 100644
index 0000000..ed5d1b7
--- /dev/null
+++ b/examples/marker/marker_small_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_marker
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: marker
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME=/eagle/projects/argonne_tpc/siebenschuh/HF_cache"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:30:00
diff --git a/examples/nougat/additional_merged_pdfs.yaml b/examples/nougat/additional_merged_pdfs.yaml
new file mode 100644
index 0000000..b387500
--- /dev/null
+++ b/examples/nougat/additional_merged_pdfs.yaml
@@ -0,0 +1,47 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/additional_pdf_only_data
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/additional_pdf_only_data_to_nougat
+
+# Only convert the first 25000 pdfs
+num_conversions: 25000
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 5
+
+parser_settings:
+  name: "nougat"
+  # Recommended batch size of pages (not pdfs) is 10, the maximum that fits into an A100 40GB.
+  batchsize: 10
+  # Path to download the checkpoint to. If it already exists, the existing checkpoint is used.
+  checkpoint: /eagle/projects/argonne_tpc/siebenschuh/nougat-checkpoint
+  # Set mmd_out to null if you don't want to write mmd files as a byproduct.
+  mmd_out: /eagle/projects/argonne_tpc/siebenschuh/additional_merged_pdfs/mmd_output
+  # If set to false, skip the pdfs that already have a parsed mmd file in mmd_out
+  recompute: false
+  # Set to true if you want to use fp32 instead of bfloat16 (false is recommended)
+  full_precision: false
+  # If set to true, output text will be formatted as a markdown file.
+  markdown: true
+  # Preempt processing a paper if mode collapse causes repetitions (true is recommended)
+  skipping: true
+  # Path for the nougat-specific logs for the run.
+  nougat_logs_path: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/logs
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate nougat-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/nougat/all_merged_pdfs.yaml b/examples/nougat/all_merged_pdfs.yaml
new file mode 100644
index 0000000..6a59fa4
--- /dev/null
+++ b/examples/nougat/all_merged_pdfs.yaml
@@ -0,0 +1,47 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/merged_pdf_only_data
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/merged_pdf_only_data_to_nougat
+
+# Only convert the first 25000 pdfs
+num_conversions: 25000
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 5
+
+parser_settings:
+  name: "nougat"
+  # Recommended batch size of pages (not pdfs) is 10, the maximum that fits into an A100 40GB.
+  batchsize: 10
+  # Path to download the checkpoint to. If it already exists, the existing checkpoint is used.
+  checkpoint: /eagle/projects/argonne_tpc/siebenschuh/nougat-checkpoint
+  # Set mmd_out to null if you don't want to write mmd files as a byproduct.
+  mmd_out: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/mmd_output
+  # If set to false, skip the pdfs that already have a parsed mmd file in mmd_out
+  recompute: false
+  # Set to true if you want to use fp32 instead of bfloat16 (false is recommended)
+  full_precision: false
+  # If set to true, output text will be formatted as a markdown file.
+  markdown: true
+  # Preempt processing a paper if mode collapse causes repetitions (true is recommended)
+  skipping: true
+  # Path for the nougat-specific logs for the run.
+  nougat_logs_path: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/logs
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate nougat-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/oreo/oreo_joint.yaml b/examples/oreo/oreo_joint.yaml
new file mode 100644
index 0000000..ad4b333
--- /dev/null
+++ b/examples/oreo/oreo_joint.yaml
@@ -0,0 +1,61 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_oreo
+
+# Only convert the first 100 pdfs for testing
+num_conversions: 100
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 4
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use.
+  name: oreo
+  # Weights for the layout detection model.
+  detection_weights_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/model_weights/yolov5_detection_weights.pt
+  # Model weights for the (meta) text classifier.
+  text_cls_weights_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/text_classifier/meta_text_classifier
+  # Path to the SPV05 category file.
+  spv05_category_file_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/meta/spv05_categories.yaml
+  # Only scan PDFs for meta statistics on their attributes.
+  detect_only: false
+  # Only parse PDFs for metadata.
+  meta_only: false
+  # Include equations in the text categories
+  equation: false
+  # Include table visualizations (will be stored).
+  table: false
+  # Include figures (will be stored).
+  figure: false
+  # Include secondary metadata (footnotes, headers).
+  secondary_meta: false
+  # If true, accelerate inference by packing non-meta text patches.
+  accelerate: false
+  # Main batch size for detection (number of images loaded per batch).
+  batch_yolo: 2
+  # Batch size of pre-processed patches for ViT pseudo-OCR inference.
+  batch_vit: 2
+  # Batch size K for subsequent text processing.
+  batch_cls: 2
+  # Number of pixels by which detected bounding boxes are offset.
+  bbox_offset: 2
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pdfwf; export HF_HOME=/lus/eagle/projects/CVD-Mol-AI/braceal/cache/huggingface"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 01:00:00
diff --git a/examples/pymupdf/pymupdf_joint.yaml b/examples/pymupdf/pymupdf_joint.yaml
new file mode 100644
index 0000000..5d36a14
--- /dev/null
+++ b/examples/pymupdf/pymupdf_joint.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/pymupdf/pymupdf_small_test.yaml b/examples/pymupdf/pymupdf_small_test.yaml
new file mode 100644
index 0000000..b05bc9e
--- /dev/null
+++ b/examples/pymupdf/pymupdf_small_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00
diff --git a/examples/pymupdf/pymupdf_test.yaml b/examples/pymupdf/pymupdf_test.yaml
new file mode 100644
index 0000000..faf6867
--- /dev/null
+++ b/examples/pymupdf/pymupdf_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00
diff --git a/examples/pypdf/pypdf_joint.yaml b/examples/pypdf/pypdf_joint.yaml
new file mode 100644
index 0000000..85d4654
--- /dev/null
+++ b/examples/pypdf/pypdf_joint.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_pypdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pypdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment (same conda env as for PyMuPDF!)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/pypdf/pypdf_test.yaml b/examples/pypdf/pypdf_test.yaml
new file mode 100644
index 0000000..1cb0315
--- /dev/null
+++ b/examples/pypdf/pypdf_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pypdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pypdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment (same conda env as for PyMuPDF!)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00
diff --git a/pdfwf/parsers/__init__.py b/pdfwf/parsers/__init__.py
index cbd0057..b499bbd 100644
--- a/pdfwf/parsers/__init__.py
+++ b/pdfwf/parsers/__init__.py
@@ -12,10 +12,22 @@ from pdfwf.parsers.nougat_ import NougatParserConfig
 from pdfwf.parsers.oreo import OreoParser
 from pdfwf.parsers.oreo import OreoParserConfig
+from pdfwf.parsers.pymupdf import PyMuPDFParser
+from pdfwf.parsers.pymupdf import PyMuPDFParserConfig
+from pdfwf.parsers.pypdf import PyPDFParser
+from pdfwf.parsers.pypdf import PyPDFParserConfig
 from pdfwf.registry import registry

-ParserConfigTypes = MarkerParserConfig | OreoParserConfig | NougatParserConfig
-ParserTypes = MarkerParser | OreoParser | NougatParser
+ParserConfigTypes = (
+    MarkerParserConfig
+    | OreoParserConfig
+    | NougatParserConfig
+    | PyMuPDFParserConfig
+    | PyPDFParserConfig
+)
+ParserTypes = (
+    MarkerParser | NougatParser | OreoParser | PyMuPDFParser | PyPDFParser
+)

 _ParserTypes = tuple[type[ParserConfigTypes], type[ParserTypes]]

@@ -23,6 +35,8 @@
     'marker': (MarkerParserConfig, MarkerParser),
     'oreo': (OreoParserConfig, OreoParser),
     'nougat': (NougatParserConfig, NougatParser),
+    'pymupdf': (PyMuPDFParserConfig, PyMuPDFParser),
+    'pypdf': (PyPDFParserConfig, PyPDFParser),
 }

@@ -54,6 +68,8 @@ def get_parser(
     - marker
     - oreo
     - nougat
+    - pymupdf
+    - pypdf

     Parameters
     ----------
diff --git a/pdfwf/parsers/marker.py b/pdfwf/parsers/marker.py
index 44e8076..1345f9e 100644
--- a/pdfwf/parsers/marker.py
+++ b/pdfwf/parsers/marker.py
@@ -54,7 +54,8 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
         """
         from marker.convert import convert_single_pdf

-        full_text, out_meta = convert_single_pdf(pdf_path, self.model_lst)
+        # Convert the PDF
+        full_text, _, out_meta = convert_single_pdf(pdf_path, self.model_lst)

         return full_text, out_meta
diff --git a/pdfwf/parsers/pymupdf.py b/pdfwf/parsers/pymupdf.py
new file mode 100644
index 0000000..cf172e3
--- /dev/null
+++ b/pdfwf/parsers/pymupdf.py
@@ -0,0 +1,126 @@
+"""The PyMuPDF PDF parser."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+from typing import Literal
+
+import fitz
+
+from pdfwf.parsers.base import BaseParser
+from pdfwf.parsers.base import BaseParserConfig
+from pdfwf.utils import exception_handler
+
+__all__ = [
+    'PyMuPDFParser',
+    'PyMuPDFParserConfig',
+]
+
+
+class PyMuPDFParserConfig(BaseParserConfig):
+    """Settings for the PyMuPDF-PDF parser."""
+
+    # The name of the parser.
+    name: Literal['pymupdf'] = 'pymupdf'  # type: ignore[assignment]
+
+
+class PyMuPDFParser(BaseParser):
+    """Interface for the PyMuPDF PDF parser."""
+
+    def __init__(self, config: PyMuPDFParserConfig) -> None:
+        """Initialize the PyMuPDF parser."""
+        self.config = config
+        self.abstract_threshold = 580
+
+    def extract_doi_info(self, input_str: str) -> str:
+        """Extract the DOI from a PyMuPDF metadata entry (if present)."""
+        match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
+        return match.group(2) if match else ''
+
+    @exception_handler(default_return=None)
+    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
+        """Parse a PDF file.
+
+        Parameters
+        ----------
+        pdf_path : str
+            Path to the PDF file to convert.
+
+        Returns
+        -------
+        tuple[str, dict[str, str]] | None
+            A tuple containing the full text of the PDF and the metadata
+            extracted from the PDF. If parsing fails, return None.
+        """
+        # Open pdf
+        doc = fitz.open(pdf_path)
+
+        # Scrape text
+        text_list = []
+        for page in doc:
+            text_list.append(page.get_text())
+        full_text = '\n'.join(text_list)
+
+        # Get first page (as a proxy for `abstract`)
+        first_page_text = text_list[0] if len(text_list) > 0 else ''
+
+        # Metadata (available to PyMuPDF)
+        title = doc.metadata.get('title', '')
+        authors = doc.metadata.get('author', '')
+        createdate = doc.metadata.get('creationDate', '')
+        keywords = doc.metadata.get('keywords', '')
+        doi = self.extract_doi_info(doc.metadata.get('subject', ''))
+        prod = doc.metadata.get('producer', '')
+        form = doc.metadata.get('format', '')
+        abstract = (
+            doc.metadata.get('subject', '')
+            if len(doc.metadata.get('subject', '')) > self.abstract_threshold
+            else ''
+        )
+
+        # Assemble the metadata
+        out_meta = {
+            'title': title,
+            'authors': authors,
+            'creationdate': createdate,
+            'keywords': keywords,
+            'doi': doi,
+            'producer': prod,
+            'format': form,
+            'first_page': first_page_text,
+            'abstract': abstract,
+        }
+
+        # TODO: Should we close the document?
+        # doc.close()
+
+        # full text & metadata entries
+        return full_text, out_meta
+
+    @exception_handler(default_return=None)
+    def parse(self, pdf_files: list[str]) -> list[dict[str, Any]] | None:
+        """Parse a list of pdf files and return the parsed data."""
+        documents = []
+        # Process each PDF
+        for pdf_file in pdf_files:
+            # Parse the PDF
+            output = self.parse_pdf(pdf_file)
+
+            # Check if the PDF was parsed successfully
+            if output is None:
+                print(f'Error: Failed to parse {pdf_file}')
+                continue
+
+            # Unpack the output
+            text, metadata = output
+
+            # Setup the document fields to be stored
+            document = {
+                'text': text,
+                'path': str(pdf_file),
+                'metadata': metadata,
+            }
+            documents.append(document)
+
+        return documents
diff --git a/pdfwf/parsers/pypdf.py b/pdfwf/parsers/pypdf.py
new file mode 100644
index 0000000..266f043
--- /dev/null
+++ b/pdfwf/parsers/pypdf.py
@@ -0,0 +1,139 @@
+"""The pypdf (not PyMuPDF!) PDF parser."""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import Any
+from typing import Literal
+
+from pypdf import PdfReader
+
+from pdfwf.parsers.base import BaseParser
+from pdfwf.parsers.base import BaseParserConfig
+from pdfwf.utils import exception_handler
+
+__all__ = [
+    'PyPDFParser',
+    'PyPDFParserConfig',
+]
+
+
+class PyPDFParserConfig(BaseParserConfig):
+    """Settings for the pypdf-PDF parser."""
+
+    # The name of the parser.
+    name: Literal['pypdf'] = 'pypdf'  # type: ignore[assignment]
+
+
+class PyPDFParser(BaseParser):
+    """Warmstart interface for the pypdf PDF parser.
+
+    No warmstart needed as pypdf is a CPU-only Python library.
+    """
+
+    def __init__(self, config: PyPDFParserConfig) -> None:
+        """Initialize the pypdf parser."""
+        self.config = config
+        self.abstract_threshold = 580
+
+        # pypdf is verbose; raise the root logger level to silence it
+        logging.getLogger().setLevel(logging.ERROR)
+
+    def extract_doi_info(self, input_str: str) -> str:
+        """Extract the DOI from a pypdf metadata entry (if present)."""
+        match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
+        return match.group(2) if match else ''
+
+    @exception_handler(default_return=None)
+    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
+        """Parse a PDF file.
+
+        Parameters
+        ----------
+        pdf_path : str
+            Path to the PDF file to convert.
+
+        Returns
+        -------
+        tuple[str, dict[str, str]] | None
+            A tuple containing the full text of the PDF and the metadata
+            extracted from the PDF. If parsing fails, return None.
+        """
+        # TODO: This needs to be closed
+        # Open
+        reader = PdfReader(pdf_path)
+
+        # Scrape text
+        full_text = ''
+        for page in reader.pages:
+            full_text += page.extract_text(extraction_mode='layout')
+
+        first_page_text = (
+            reader.pages[0].extract_text(extraction_mode='layout')
+            if len(reader.pages) > 0
+            else ''
+        )
+        meta = reader.metadata
+
+        # Metadata (available to pypdf)
+        title = meta.get('/Title', '')
+        authors = meta.get('/Author', '')
+        createdate = meta.get('/CreationDate', '')
+        keywords = meta.get('/Keywords', '')
+        # Use .get() to handle the missing DOI key
+        doi = (
+            meta.get('/doi', '')
+            if meta.get('/doi', '') != ''
+            else self.extract_doi_info(meta.get('/Subject', ''))
+        )
+        prod = meta.get('/Producer', '')
+        # '/Format' is not a standard pypdf metadata key, so this usually falls back to ''
+        form = meta.get('/Format', '')
+        abstract = (
+            meta.get('/Subject', '')
+            if len(meta.get('/Subject', '')) > self.abstract_threshold
+            else ''
+        )
+
+        # Assemble the metadata
+        out_meta = {
+            'title': title,
+            'authors': authors,
+            'createdate': createdate,
+            'keywords': keywords,
+            'doi': doi,
+            'producer': prod,
+            'format': form,
+            'first_page': first_page_text,
+            'abstract': abstract,
+        }
+
+        return full_text, out_meta
+
+    @exception_handler(default_return=None)
+    def parse(self, pdf_files: list[str]) -> list[dict[str, Any]] | None:
+        """Parse a list of pdf files and return the parsed data."""
+        documents = []
+        # Process each PDF
+        for pdf_file in pdf_files:
+            # Parse the PDF
+            output = self.parse_pdf(pdf_file)
+
+            # Check if the PDF was parsed successfully
+            if output is None:
+                print(f'Error: Failed to parse {pdf_file}')
+                continue
+
+            # Unpack the output
+            text, metadata = output
+
+            # Setup the document fields to be stored
+            document = {
+                'text': text,
+                'path': str(pdf_file),
+                'metadata': metadata,
+            }
+            documents.append(document)
+
+        return documents
diff --git a/pyproject.toml b/pyproject.toml
index 341c343..bff3465 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "pdfwf"
-version = "0.1.6"
+version = "0.1.7"
 authors = [
     {name = "Kyle Hippe", email = "khippe@anl.gov"},
     {name = "Alexander Brace", email = "abrace@anl.gov"},
     {name = "Ozan Gokdemir", email = "ogokdemir@anl.gov"},
-    {name = "Carlo Siebenschuh", email = "siebenschuh@uchicago.edu"}
+    {name = "Carlo Siebenschuh", email = "siebenschuh@anl.gov"}
 ]
 description = "Workflow for running pdf to text conversions at scale"
 readme = "README.md"
@@ -25,6 +25,8 @@ dependencies = [
     "parsl>=2024.2.5",
     "pydantic>=2.6.1",
     "typer[all]>=0.7.0",
+    "PyMuPDF>=1.24.0",
+    "pypdf>=4.2.0"
 ]

 [project.urls]
diff --git a/requirements/pymupdf_requirements.txt b/requirements/pymupdf_requirements.txt
new file mode 100644
index 0000000..ebcb9f8
--- /dev/null
+++ b/requirements/pymupdf_requirements.txt
@@ -0,0 +1,3 @@
+PyMuPDF==1.24.9
+pypdf==4.3.1
+texify==0.1.10
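For reviewers who want to exercise the new extraction logic without standing up the Parsl workflow, the sketch below replays the same PyMuPDF calls that `PyMuPDFParser.parse_pdf` makes, including the shared DOI regex. `sample.pdf` is a placeholder path, not a file in this repo:

```python
import re

import fitz  # PyMuPDF


def extract_doi_info(input_str: str) -> str:
    # Same regex used by PyMuPDFParser and PyPDFParser above
    match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
    return match.group(2) if match else ''


doc = fitz.open('sample.pdf')  # placeholder path
full_text = '\n'.join(page.get_text() for page in doc)
print(full_text[:200])

# The 'subject' metadata entry doubles as the DOI carrier,
# e.g. 'https://doi.org/10.1000/xyz123' -> '10.1000/xyz123'
print(extract_doi_info(doc.metadata.get('subject', '')))
doc.close()
```

Note that the regex keys off a `doi:` prefix or a `doi.org/` URL, so a bare DOI such as `10.1000/xyz123` in the subject field is intentionally not picked up.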