diff --git a/README.md b/README.md
index 0bbfeb5..f8d7a65 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ to run on other HPC systems by adding an appropriate
 - [Install Marker](#marker-pipeline-installation)
 - [Install Nougat](#nougat-pipeline-installation)
 - [Install Oreo](#oreo-pipeline-installation)
+- [Install PyMuPDF](#pymupdf-pipeline-installation)

 **Note**: You may need different virtual environments for each parser.

@@ -74,7 +75,7 @@ compute_settings:
   # The number of compute nodes to use
   num_nodes: 1
   # Make sure to update the path to your conda environment and HF cache
-  worker_init: "module load conda/2023-10-04; conda activate marker-wf; export HF_HOME="
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME="
   # The scheduler options to use when submitting jobs
   scheduler_options: "#PBS -l filesystems=home:eagle:grand"
   # Make sure to change the account to the account you want to charge
@@ -89,6 +90,7 @@ We provide example configurations for each parser in these files:
 - **marker**: [examples/marker/marker_test.yaml](examples/marker/marker_test.yaml)
 - **nougat**: [examples/nougat/nougat_test.yaml](examples/nougat/nougat_test.yaml)
 - **oreo**: [examples/oreo/oreo_test.yaml](examples/oreo/oreo_test.yaml)
+- **pymupdf**: [examples/pymupdf/pymupdf_test.yaml](examples/pymupdf/pymupdf_test.yaml)

 **Note**: Please see the comments in the example YAML files for **documentation on the settings**.

@@ -175,7 +177,7 @@ pip install pypdf
 On a compute node, run:
 ```bash
 # Create a conda environment with python3.10
-module load conda/2023-10-04
+module use /soft/modulefiles; module load conda/2024-04-29
 conda create -n nougat-wf python=3.10
 conda activate nougat-wf
@@ -203,7 +205,7 @@ If `Nougat` inference ran successfully, proceed to install the `pdfwf` workflow
 ## `Oreo` Pipeline Installation
 On a compute node, run:
 ```bash
-module load conda/2023-10-04
+module use /soft/modulefiles; module load conda/2024-04-29
 conda create -n pdfwf python=3.10 -y
 conda activate pdfwf
 mamba install pytorch torchvision pytorch-cuda=11.8 -c pytorch -c nvidia -y
@@ -213,6 +215,20 @@ git clone git@github.com:ultralytics/yolov5.git
 Then set the `yolov5_path` option to the path of the cloned `yolov5` repository.

+## `PyMuPDF` Pipeline Installation
+On any node, run:
+```bash
+module use /soft/modulefiles; module load conda/2024-04-29
+conda create -n pymupdf-wf python=3.10 -y
+conda activate pymupdf-wf
+pip install -r requirements/pymupdf_requirements.txt
+```
+to create a conda environment that provides both PDF extraction tools, `PyMuPDF` and `pypdf`.
+Both tools are lightweight and run from the same conda environment.
+
+## `pypdf` Pipeline Installation
+`pypdf` requires no separate setup: it is lightweight and runs in the same `pymupdf-wf` conda environment as `PyMuPDF`.
+
 ## CLI
 For running smaller jobs without using the Parsl workflow, the CLI can be used. The CLI provides different commands for running the various parsers.
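For a quick sanity check of the shared environment before submitting any jobs, the snippet below can be run inside `pymupdf-wf`. This is a minimal sketch, assuming only that the install above succeeded; note that PyMuPDF installs under the import name `fitz`:

```python
# Run inside the pymupdf-wf environment to confirm both libraries resolve.
import fitz   # PyMuPDF's import name
import pypdf

# The printed versions should match requirements/pymupdf_requirements.txt
print(fitz.__doc__)        # e.g. 'PyMuPDF 1.24.9: Python bindings for ...'
print(pypdf.__version__)   # e.g. '4.3.1'
```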
diff --git a/examples/marker/marker_joint.yaml b/examples/marker/marker_joint.yaml
new file mode 100644
index 0000000..3941d34
--- /dev/null
+++ b/examples/marker/marker_joint.yaml
@@ -0,0 +1,30 @@
+# The directory containing the pdfs to convert
+pdf_dir: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_marker
+
+# Chunk size for pdf parsing
+chunk_size: 100
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: marker
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME=/eagle/projects/argonne_tpc/siebenschuh/HF_cache"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/marker/marker_small_test.yaml b/examples/marker/marker_small_test.yaml
new file mode 100644
index 0000000..ed5d1b7
--- /dev/null
+++ b/examples/marker/marker_small_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_marker
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: marker
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME=/eagle/projects/argonne_tpc/siebenschuh/HF_cache"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:30:00
diff --git a/examples/nougat/additional_merged_pdfs.yaml b/examples/nougat/additional_merged_pdfs.yaml
new file mode 100644
index 0000000..b387500
--- /dev/null
+++ b/examples/nougat/additional_merged_pdfs.yaml
@@ -0,0 +1,47 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/additional_pdf_only_data
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/additional_pdf_only_data_to_nougat
+
+# Only convert the first 25000 pdfs
+num_conversions: 25000
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 5
+
+parser_settings:
+  name: "nougat"
+  # Recommended batch size of pages (not pdfs) is 10, the maximum that fits into an A100 40GB.
+  batchsize: 10
+  # Path to download the checkpoint to. If it already exists, the existing checkpoint is used.
+  checkpoint: /eagle/projects/argonne_tpc/siebenschuh/nougat-checkpoint
+  # Set mmd_out to null if you don't want to write mmd files as a byproduct.
+  mmd_out: /eagle/projects/argonne_tpc/siebenschuh/additional_merged_pdfs/mmd_output
+  # If set to false, skip the pdfs that already have a parsed mmd file in mmd_out
+  recompute: false
+  # Set to true if you want to use fp32 instead of bfloat16 (false is recommended)
+  full_precision: false
+  # If set to true, output text will be formatted as a markdown file.
+  markdown: true
+  # Preempt processing a paper if mode collapse causes repetitions (true is recommended)
+  skipping: true
+  # Path for the nougat-specific logs for the run.
+  nougat_logs_path: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/logs
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate nougat-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/nougat/all_merged_pdfs.yaml b/examples/nougat/all_merged_pdfs.yaml
new file mode 100644
index 0000000..6a59fa4
--- /dev/null
+++ b/examples/nougat/all_merged_pdfs.yaml
@@ -0,0 +1,47 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/merged_pdf_only_data
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/merged_pdf_only_data_to_nougat
+
+# Only convert the first 25000 pdfs
+num_conversions: 25000
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 5
+
+parser_settings:
+  name: "nougat"
+  # Recommended batch size of pages (not pdfs) is 10, the maximum that fits into an A100 40GB.
+  batchsize: 10
+  # Path to download the checkpoint to. If it already exists, the existing checkpoint is used.
+  checkpoint: /eagle/projects/argonne_tpc/siebenschuh/nougat-checkpoint
+  # Set mmd_out to null if you don't want to write mmd files as a byproduct.
+  mmd_out: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/mmd_output
+  # If set to false, skip the pdfs that already have a parsed mmd file in mmd_out
+  recompute: false
+  # Set to true if you want to use fp32 instead of bfloat16 (false is recommended)
+  full_precision: false
+  # If set to true, output text will be formatted as a markdown file.
+  markdown: true
+  # Preempt processing a paper if mode collapse causes repetitions (true is recommended)
+  skipping: true
+  # Path for the nougat-specific logs for the run.
+  nougat_logs_path: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/logs
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate nougat-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/oreo/oreo_joint.yaml b/examples/oreo/oreo_joint.yaml
new file mode 100644
index 0000000..ad4b333
--- /dev/null
+++ b/examples/oreo/oreo_joint.yaml
@@ -0,0 +1,61 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_oreo
+
+# Only convert the first 100 pdfs for testing
+num_conversions: 100
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 4
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use.
+  name: oreo
+  # Weights for the layout detection model.
+  detection_weights_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/model_weights/yolov5_detection_weights.pt
+  # Model weights for the (meta) text classifier.
+  text_cls_weights_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/text_classifier/meta_text_classifier
+  # Path to the SPV05 category file.
+  spv05_category_file_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/meta/spv05_categories.yaml
+  # Only scan PDFs for meta statistics on their attributes.
+  detect_only: false
+  # Only parse PDFs for metadata.
+  meta_only: false
+  # Include equations in the text categories
+  equation: false
+  # Include table visualizations (will be stored).
+  table: false
+  # Include figures (will be stored).
+  figure: false
+  # Include secondary metadata (footnotes, headers).
+  secondary_meta: false
+  # If true, accelerate inference by packing non-meta text patches.
+  accelerate: false
+  # Main batch size for detection (number of images loaded per batch).
+  batch_yolo: 2
+  # Batch size of pre-processed patches for ViT pseudo-OCR inference.
+  batch_vit: 2
+  # Batch size K for subsequent text processing.
+  batch_cls: 2
+  # Number of pixels by which detected bounding boxes are offset.
+  bbox_offset: 2
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pdfwf; export HF_HOME=/lus/eagle/projects/CVD-Mol-AI/braceal/cache/huggingface"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 01:00:00
diff --git a/examples/pymupdf/pymupdf_joint.yaml b/examples/pymupdf/pymupdf_joint.yaml
new file mode 100644
index 0000000..5d36a14
--- /dev/null
+++ b/examples/pymupdf/pymupdf_joint.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/pymupdf/pymupdf_small_test.yaml b/examples/pymupdf/pymupdf_small_test.yaml
new file mode 100644
index 0000000..b05bc9e
--- /dev/null
+++ b/examples/pymupdf/pymupdf_small_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00
diff --git a/examples/pymupdf/pymupdf_test.yaml b/examples/pymupdf/pymupdf_test.yaml
new file mode 100644
index 0000000..faf6867
--- /dev/null
+++ b/examples/pymupdf/pymupdf_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00
diff --git a/examples/pypdf/pypdf_joint.yaml b/examples/pypdf/pypdf_joint.yaml
new file mode 100644
index 0000000..85d4654
--- /dev/null
+++ b/examples/pypdf/pypdf_joint.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_pypdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pypdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment (same conda env as for PyMuPDF!)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/pypdf/pypdf_test.yaml b/examples/pypdf/pypdf_test.yaml
new file mode 100644
index 0000000..1cb0315
--- /dev/null
+++ b/examples/pypdf/pypdf_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pypdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pypdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment (same conda env as for PyMuPDF!)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00
diff --git a/pdfwf/parsers/__init__.py b/pdfwf/parsers/__init__.py
index cbd0057..b499bbd 100644
--- a/pdfwf/parsers/__init__.py
+++ b/pdfwf/parsers/__init__.py
@@ -12,10 +12,22 @@ from pdfwf.parsers.nougat_ import NougatParserConfig
 from pdfwf.parsers.oreo import OreoParser
 from pdfwf.parsers.oreo import OreoParserConfig
+from pdfwf.parsers.pymupdf import PyMuPDFParser
+from pdfwf.parsers.pymupdf import PyMuPDFParserConfig
+from pdfwf.parsers.pypdf import PyPDFParser
+from pdfwf.parsers.pypdf import PyPDFParserConfig
 from pdfwf.registry import registry

-ParserConfigTypes = MarkerParserConfig | OreoParserConfig | NougatParserConfig
-ParserTypes = MarkerParser | OreoParser | NougatParser
+ParserConfigTypes = (
+    MarkerParserConfig
+    | OreoParserConfig
+    | NougatParserConfig
+    | PyMuPDFParserConfig
+    | PyPDFParserConfig
+)
+ParserTypes = (
+    MarkerParser | NougatParser | OreoParser | PyMuPDFParser | PyPDFParser
+)

 _ParserTypes = tuple[type[ParserConfigTypes], type[ParserTypes]]

@@ -23,6 +35,8 @@
     'marker': (MarkerParserConfig, MarkerParser),
     'oreo': (OreoParserConfig, OreoParser),
     'nougat': (NougatParserConfig, NougatParser),
+    'pymupdf': (PyMuPDFParserConfig, PyMuPDFParser),
+    'pypdf': (PyPDFParserConfig, PyPDFParser),
 }

@@ -54,6 +68,8 @@ def get_parser(
     - marker
     - oreo
     - nougat
+    - pymupdf
+    - pypdf

     Parameters
     ----------
diff --git a/pdfwf/parsers/marker.py b/pdfwf/parsers/marker.py
index 44e8076..1345f9e 100644
--- a/pdfwf/parsers/marker.py
+++ b/pdfwf/parsers/marker.py
@@ -54,7 +54,8 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
         """
         from marker.convert import convert_single_pdf

-        full_text, out_meta = convert_single_pdf(pdf_path, self.model_lst)
+        # Convert the PDF
+        full_text, _, out_meta = convert_single_pdf(pdf_path, self.model_lst)

         return full_text, out_meta
diff --git a/pdfwf/parsers/pymupdf.py b/pdfwf/parsers/pymupdf.py
new file mode 100644
index 0000000..cf172e3
--- /dev/null
+++ b/pdfwf/parsers/pymupdf.py
@@ -0,0 +1,126 @@
+"""The PyMuPDF PDF parser."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+from typing import Literal
+
+import fitz
+
+from pdfwf.parsers.base import BaseParser
+from pdfwf.parsers.base import BaseParserConfig
+from pdfwf.utils import exception_handler
+
+__all__ = [
+    'PyMuPDFParser',
+    'PyMuPDFParserConfig',
+]
+
+
+class PyMuPDFParserConfig(BaseParserConfig):
+    """Settings for the PyMuPDF-PDF parser."""
+
+    # The name of the parser.
+    name: Literal['pymupdf'] = 'pymupdf'  # type: ignore[assignment]
+
+
+class PyMuPDFParser(BaseParser):
+    """Interface for the PyMuPDF PDF parser."""
+
+    def __init__(self, config: PyMuPDFParserConfig) -> None:
+        """Initialize the PyMuPDF parser."""
+        self.config = config
+        self.abstract_threshold = 580
+
+    def extract_doi_info(self, input_str: str) -> str:
+        """Extract the DOI from a PyMuPDF metadata entry (if present)."""
+        match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
+        return match.group(2) if match else ''
+
+    @exception_handler(default_return=None)
+    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
+        """Parse a PDF file.
+
+        Parameters
+        ----------
+        pdf_path : str
+            Path to the PDF file to convert.
+
+        Returns
+        -------
+        tuple[str, dict[str, str]] | None
+            A tuple containing the full text of the PDF and the metadata
+            extracted from the PDF. If parsing fails, return None.
+        """
+        # Open pdf
+        doc = fitz.open(pdf_path)
+
+        # Scrape text
+        text_list = []
+        for page in doc:
+            text_list.append(page.get_text())
+        full_text = '\n'.join(text_list)
+
+        # Get first page (as a proxy for `abstract`)
+        first_page_text = text_list[0] if len(text_list) > 0 else ''
+
+        # Metadata (available to PyMuPDF)
+        title = doc.metadata.get('title', '')
+        authors = doc.metadata.get('author', '')
+        createdate = doc.metadata.get('creationDate', '')
+        keywords = doc.metadata.get('keywords', '')
+        doi = self.extract_doi_info(doc.metadata.get('subject', ''))
+        prod = doc.metadata.get('producer', '')
+        form = doc.metadata.get('format', '')
+        abstract = (
+            doc.metadata.get('subject', '')
+            if len(doc.metadata.get('subject', '')) > self.abstract_threshold
+            else ''
+        )
+
+        # Assemble the metadata
+        out_meta = {
+            'title': title,
+            'authors': authors,
+            'creationdate': createdate,
+            'keywords': keywords,
+            'doi': doi,
+            'producer': prod,
+            'format': form,
+            'first_page': first_page_text,
+            'abstract': abstract,
+        }
+
+        # TODO: Should we close the document?
+        # doc.close()
+
+        # full text & metadata entries
+        return full_text, out_meta
+
+    @exception_handler(default_return=None)
+    def parse(self, pdf_files: list[str]) -> list[dict[str, Any]] | None:
+        """Parse a list of pdf files and return the parsed data."""
+        documents = []
+        # Process each PDF
+        for pdf_file in pdf_files:
+            # Parse the PDF
+            output = self.parse_pdf(pdf_file)
+
+            # Check if the PDF was parsed successfully
+            if output is None:
+                print(f'Error: Failed to parse {pdf_file}')
+                continue
+
+            # Unpack the output
+            text, metadata = output
+
+            # Setup the document fields to be stored
+            document = {
+                'text': text,
+                'path': str(pdf_file),
+                'metadata': metadata,
+            }
+            documents.append(document)
+
+        return documents
diff --git a/pdfwf/parsers/pypdf.py b/pdfwf/parsers/pypdf.py
new file mode 100644
index 0000000..266f043
--- /dev/null
+++ b/pdfwf/parsers/pypdf.py
@@ -0,0 +1,139 @@
+"""The pypdf (not PyMuPDF!) PDF parser."""
+
+from __future__ import annotations
+
+import logging
+import re
+from typing import Any
+from typing import Literal
+
+from pypdf import PdfReader
+
+from pdfwf.parsers.base import BaseParser
+from pdfwf.parsers.base import BaseParserConfig
+from pdfwf.utils import exception_handler
+
+__all__ = [
+    'PyPDFParser',
+    'PyPDFParserConfig',
+]
+
+
+class PyPDFParserConfig(BaseParserConfig):
+    """Settings for the pypdf-PDF parser."""
+
+    # The name of the parser.
+    name: Literal['pypdf'] = 'pypdf'  # type: ignore[assignment]
+
+
+class PyPDFParser(BaseParser):
+    """Warmstart interface for the pypdf PDF parser.
+
+    No warmstart needed as pypdf is a CPU-only Python library.
+    """
+
+    def __init__(self, config: PyPDFParserConfig) -> None:
+        """Initialize the pypdf parser."""
+        self.config = config
+        self.abstract_threshold = 580
+
+        # pypdf is verbose; raise the root logger level to silence it
+        logging.getLogger().setLevel(logging.ERROR)
+
+    def extract_doi_info(self, input_str: str) -> str:
+        """Extract the DOI from a pypdf metadata entry (if present)."""
+        match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
+        return match.group(2) if match else ''
+
+    @exception_handler(default_return=None)
+    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
+        """Parse a PDF file.
+
+        Parameters
+        ----------
+        pdf_path : str
+            Path to the PDF file to convert.
+
+        Returns
+        -------
+        tuple[str, dict[str, str]] | None
+            A tuple containing the full text of the PDF and the metadata
+            extracted from the PDF. If parsing fails, return None.
+        """
+        # TODO: This needs to be closed
+        # Open
+        reader = PdfReader(pdf_path)
+
+        # Scrape text
+        full_text = ''
+        for page in reader.pages:
+            full_text += page.extract_text(extraction_mode='layout')
+
+        first_page_text = (
+            reader.pages[0].extract_text(extraction_mode='layout')
+            if len(reader.pages) > 0
+            else ''
+        )
+        meta = reader.metadata
+
+        # Metadata (available to pypdf)
+        title = meta.get('/Title', '')
+        authors = meta.get('/Author', '')
+        createdate = meta.get('/CreationDate', '')
+        keywords = meta.get('/Keywords', '')
+        # Use .get() to handle the missing DOI key
+        doi = (
+            meta.get('/doi', '')
+            if meta.get('/doi', '') != ''
+            else self.extract_doi_info(meta.get('/Subject', ''))
+        )
+        prod = meta.get('/Producer', '')
+        # '/Format' is not a standard pypdf metadata key, so this usually falls back to ''
+        form = meta.get('/Format', '')
+        abstract = (
+            meta.get('/Subject', '')
+            if len(meta.get('/Subject', '')) > self.abstract_threshold
+            else ''
+        )
+
+        # Assemble the metadata
+        out_meta = {
+            'title': title,
+            'authors': authors,
+            'createdate': createdate,
+            'keywords': keywords,
+            'doi': doi,
+            'producer': prod,
+            'format': form,
+            'first_page': first_page_text,
+            'abstract': abstract,
+        }
+
+        return full_text, out_meta
+
+    @exception_handler(default_return=None)
+    def parse(self, pdf_files: list[str]) -> list[dict[str, Any]] | None:
+        """Parse a list of pdf files and return the parsed data."""
+        documents = []
+        # Process each PDF
+        for pdf_file in pdf_files:
+            # Parse the PDF
+            output = self.parse_pdf(pdf_file)
+
+            # Check if the PDF was parsed successfully
+            if output is None:
+                print(f'Error: Failed to parse {pdf_file}')
+                continue
+
+            # Unpack the output
+            text, metadata = output
+
+            # Setup the document fields to be stored
+            document = {
+                'text': text,
+                'path': str(pdf_file),
+                'metadata': metadata,
+            }
+            documents.append(document)
+
+        return documents
diff --git a/pyproject.toml b/pyproject.toml
index 341c343..bff3465 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "pdfwf"
-version = "0.1.6"
+version = "0.1.7"
 authors = [
     {name = "Kyle Hippe", email = "khippe@anl.gov"},
     {name = "Alexander Brace", email = "abrace@anl.gov"},
     {name = "Ozan Gokdemir", email = "ogokdemir@anl.gov"},
-    {name = "Carlo Siebenschuh", email = "siebenschuh@uchicago.edu"}
+    {name = "Carlo Siebenschuh", email = "siebenschuh@anl.gov"}
 ]
 description = "Workflow for running pdf to text conversions at scale"
 readme = "README.md"
@@ -25,6 +25,8 @@ dependencies = [
     "parsl>=2024.2.5",
     "pydantic>=2.6.1",
     "typer[all]>=0.7.0",
+    "PyMuPDF>=1.24.0",
+    "pypdf>=4.2.0"
 ]

 [project.urls]
diff --git a/requirements/pymupdf_requirements.txt b/requirements/pymupdf_requirements.txt
new file mode 100644
index 0000000..ebcb9f8
--- /dev/null
+++ b/requirements/pymupdf_requirements.txt
@@ -0,0 +1,3 @@
+PyMuPDF==1.24.9
+pypdf==4.3.1
+texify==0.1.10
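For reviewers who want to exercise the new extraction logic without standing up the Parsl workflow, the sketch below replays the same PyMuPDF calls that `PyMuPDFParser.parse_pdf` makes, including the shared DOI regex. `sample.pdf` is a placeholder path, not a file in this repo:

```python
import re

import fitz  # PyMuPDF


def extract_doi_info(input_str: str) -> str:
    # Same regex used by PyMuPDFParser and PyPDFParser above
    match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
    return match.group(2) if match else ''


doc = fitz.open('sample.pdf')  # placeholder path
full_text = '\n'.join(page.get_text() for page in doc)
print(full_text[:200])

# The 'subject' metadata entry doubles as the DOI carrier,
# e.g. 'https://doi.org/10.1000/xyz123' -> '10.1000/xyz123'
print(extract_doi_info(doc.metadata.get('subject', '')))
doc.close()
```

Note that the regex keys off a `doi:` prefix or a `doi.org/` URL, so a bare DOI such as `10.1000/xyz123` in the subject field is intentionally not picked up.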