ramanathanlab · braceal · Sep 17, 2024 · Aug 13, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ to run on other HPC systems by adding an appropriate
 - [Install Marker](#marker-pipeline-installation)
 - [Install Nougat](#nougat-pipeline-installation)
 - [Install Oreo](#oreo-pipeline-installation)
+- [Install PyMuPDF](#pymupdf-pipeline-installation)
 
 **Note**: You may need different virtual environments for each parser.
 
@@ -74,7 +75,7 @@ compute_settings:
   # The number of compute nodes to use
   num_nodes: 1
   # Make sure to update the path to your conda environment and HF cache
-  worker_init: "module load conda/2023-10-04; conda activate marker-wf; export HF_HOME=<path-to-your-HF-cache-dir>"
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME=<path-to-your-HF-cache-dir>"
   # The scheduler options to use when submitting jobs
   scheduler_options: "#PBS -l filesystems=home:eagle:grand"
   # Make sure to change the account to the account you want to charge
@@ -89,6 +90,7 @@ We provide example configurations for each parser in these files:
 - **marker**: [examples/marker/marker_test.yaml](examples/marker/marker_test.yaml)
 - **nougat**: [examples/nougat/nougat_test.yaml](examples/nougat/nougat_test.yaml)
 - **oreo**: [examples/oreo/oreo_test.yaml](examples/oreo/oreo_test.yaml)
+- **pymupdf**: [examples/pymupdf/pymupdf_test.yaml](examples/pymupdf/pymupdf_test.yaml)
 
 **Note**: Please see the comments in the example YAML files for **documentation
  on the settings**.
@@ -158,7 +160,7 @@ mkdir wf-validation
 cd wf-validation/
 
 # Create a base conda environment
-module load conda/2023-10-04
+module use /soft/modulefiles; module load conda/2024-04-29
 conda create -n marker-wf python=3.10
 conda activate marker-wf
 
@@ -200,7 +202,7 @@ echo "INFERENCE_RAM=40" >> marker/local.env
 On a compute node, run:
 ```bash
 # Create a conda environment with python3.10
-module load conda/2023-10-04
+module use /soft/modulefiles; module load conda/2024-04-29
 conda create -n nougat-wf python=3.10
 conda activate nougat-wf
 
@@ -228,7 +230,7 @@ If `Nougat` inference ran successfully, proceed to install the `pdfwf` workflow
 ## `Oreo` Pipeline Installation
 On a compute node, run:
 ```bash
-module load conda/2023-10-04
+module use /soft/modulefiles; module load conda/2024-04-29
 conda create -n pdfwf python=3.10 -y
 conda activate pdfwf
 mamba install pytorch torchvision pytorch-cuda=11.8 -c pytorch -c nvidia -y
@@ -238,6 +240,20 @@ git clone [email protected]:ultralytics/yolov5.git
 
 Then set the `yolov5_path` option to the path of the cloned `yolov5` repository.
 
+## `PyMuPDF` Pipeline Installation
+On any node, run:
+```bash
+module use /soft/modulefiles; module load conda/2024-04-29
+conda create -n pymupdf-wf python=3.10 -y
+conda activate pymupdf-wf
+pip install -r requirements/pymupdf_requirements.txt
+```
+This creates a conda environment that serves both PDF extraction tools, `PyMuPDF` and `pypdf`.
+Both tools are lightweight and run from the same conda environment.
+
+## `pypdf` Pipeline Installation
+`pypdf` needs no separate installation: like `PyMuPDF`, it is lightweight and runs in the `pymupdf-wf` conda environment created in the [`PyMuPDF` installation](#pymupdf-pipeline-installation) step.
+
 ## CLI
 For running smaller jobs without using the Parsl workflow, the CLI can be used.
 The CLI provides different commands for running the various parsers. The CLI

diff --git a/examples/marker/marker_joint.yaml b/examples/marker/marker_joint.yaml
@@ -0,0 +1,30 @@
+# The directory containing the pdfs to convert
+pdf_dir: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_marker
+
+# Chunk size for pdf parsing
+chunk_size: 100
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: marker
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME=/eagle/projects/argonne_tpc/siebenschuh/HF_cache"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/marker/marker_small_test.yaml b/examples/marker/marker_small_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_marker
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: marker
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate marker-wf; export HF_HOME=/eagle/projects/argonne_tpc/siebenschuh/HF_cache"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:30:00
diff --git a/examples/nougat/additional_merged_pdfs.yaml b/examples/nougat/additional_merged_pdfs.yaml
@@ -0,0 +1,47 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/additional_pdf_only_data
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/additional_pdf_only_data_to_nougat
+
+# Only convert the first 25000 pdfs (use a small value, e.g. 100, for testing)
+num_conversions: 25000
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 5
+
+parser_settings:
+  name: "nougat"
+  # Recommended batch size of pages (not pdfs) is 10, maximum that fits into A100 40GB.
+  batchsize: 10
+  # Path to download the checkpoint to. If it already exists, the existing checkpoint is used.
+  checkpoint: /eagle/projects/argonne_tpc/siebenschuh/nougat-checkpoint
+  # Set mmd_out to null if you don't want to write mmd files as a byproduct.
+  mmd_out: /eagle/projects/argonne_tpc/siebenschuh/additional_merged_pdfs/mmd_output
+  # If set to false, pdfs that already have a parsed mmd file in mmd_out are skipped.
+  recompute: false
+  # Set to true if you want to use fp32 instead of bfloat16 (false is recommended)
+  full_precision: false
+  # If set to true, output text will be formatted as a markdown file.
+  markdown: true
+  # Preempt processing a paper if mode collapse causes repetitions (true is recommended)
+  skipping: true
+  # Path for the nougat-specific logs for the run.
+  nougat_logs_path: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/logs
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the conda environment name to match yours (no HF cache is exported here)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate nougat-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/nougat/all_merged_pdfs.yaml b/examples/nougat/all_merged_pdfs.yaml
@@ -0,0 +1,47 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/merged_pdf_only_data
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/merged_pdf_only_data_to_nougat
+
+# Only convert the first 25000 pdfs (use a small value, e.g. 100, for testing)
+num_conversions: 25000
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 5
+
+parser_settings:
+  name: "nougat"
+  # Recommended batch size of pages (not pdfs) is 10, maximum that fits into A100 40GB.
+  batchsize: 10
+  # Path to download the checkpoint to. If it already exists, the existing checkpoint is used.
+  checkpoint: /eagle/projects/argonne_tpc/siebenschuh/nougat-checkpoint
+  # Set mmd_out to null if you don't want to write mmd files as a byproduct.
+  mmd_out: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/mmd_output
+  # If set to false, pdfs that already have a parsed mmd file in mmd_out are skipped.
+  recompute: false
+  # Set to true if you want to use fp32 instead of bfloat16 (false is recommended)
+  full_precision: false
+  # If set to true, output text will be formatted as a markdown file.
+  markdown: true
+  # Preempt processing a paper if mode collapse causes repetitions (true is recommended)
+  skipping: true
+  # Path for the nougat-specific logs for the run.
+  nougat_logs_path: /eagle/projects/argonne_tpc/siebenschuh/all_merged_pdfs/logs
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 10
+  # Make sure to update the conda environment name to match yours (no HF cache is exported here)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate nougat-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/oreo/oreo_joint.yaml b/examples/oreo/oreo_joint.yaml
@@ -0,0 +1,61 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The output directory of the workflow
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_oreo
+
+# Only convert the first 100 pdfs for testing
+num_conversions: 100
+
+# The number of PDFs to parse in each parsl task
+chunk_size: 4
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use.
+  name: oreo
+  # Weights to layout detection model.
+  detection_weights_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/model_weights/yolov5_detection_weights.pt
+  # Model weights for (meta) text classifier.
+  text_cls_weights_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/text_classifier/meta_text_classifier
+  # Path to the SPV05 category file.
+  spv05_category_file_path: /lus/eagle/projects/argonne_tpc/siebenschuh/N-O-REO/meta/spv05_categories.yaml
+  # Only scan PDFs for meta statistics on their attributes.
+  detect_only: false
+  # Only parse PDFs for meta data.
+  meta_only: false
+  # Include equations in the text categories.
+  equation: false
+  # Include table visualizations (will be stored).
+  table: false
+  # Include figures (will be stored).
+  figure: false
+  # Include secondary meta data (footnote, headers).
+  secondary_meta: false
+  # If true, accelerate inference by packing non-meta text patches.
+  accelerate: false
+  # Main batch size for detection/# of images loaded per batch.
+  batch_yolo: 2
+  # Batch size of pre-processed patches for ViT pseudo-OCR inference.
+  batch_vit: 2
+  # Batch size K for subsequent text processing.
+  batch_cls: 2
+  # Number of pixels by which bounding boxes are offset (original comment was truncated; TODO confirm exact meaning).
+  bbox_offset: 2
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the path to your conda environment and HF cache
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pdfwf; export HF_HOME=/lus/eagle/projects/CVD-Mol-AI/braceal/cache/huggingface"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug-scaling
+  # The amount of time to request for your job
+  walltime: 01:00:00
diff --git a/examples/pymupdf/pymupdf_joint.yaml b/examples/pymupdf/pymupdf_joint.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/joint_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the conda environment name to match yours (no HF cache is exported here)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:60:00
diff --git a/examples/pymupdf/pymupdf_small_test.yaml b/examples/pymupdf/pymupdf_small_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the conda environment name to match yours (no HF cache is exported here)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: FoundEpidem
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00
diff --git a/examples/pymupdf/pymupdf_test.yaml b/examples/pymupdf/pymupdf_test.yaml
@@ -0,0 +1,27 @@
+# The directory containing the pdfs to convert
+pdf_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set
+
+# The directory to place the converted pdfs in
+out_dir: /eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/tiny_test_pdf_set_to_pymupdf
+
+# The settings for the pdf parser
+parser_settings:
+  # The name of the parser to use
+  name: pymupdf
+
+# The compute settings for the workflow
+compute_settings:
+  # The name of the compute platform to use
+  name: polaris
+  # The number of compute nodes to use
+  num_nodes: 1
+  # Make sure to update the conda environment name to match yours (no HF cache is exported here)
+  worker_init: "module use /soft/modulefiles; module load conda/2024-04-29; conda activate pymupdf-wf"
+  # The scheduler options to use when submitting jobs
+  scheduler_options: "#PBS -l filesystems=home:eagle"
+  # Make sure to change the account to the account you want to charge
+  account: argonne_tpc
+  # The HPC queue to submit to
+  queue: debug
+  # The amount of time to request for your job
+  walltime: 00:15:00