passing ruff

ramanathanlab · Sep 13, 2024 · 8234c18 · 8234c18
1 parent 57d467b
commit 8234c18
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 62 deletions.
diff --git a/README.md b/README.md
@@ -84,7 +84,7 @@ We provide example configurations for each parser in these files:
 - **nougat**: [examples/nougat/nougat_test.yaml](examples/nougat/nougat_test.yaml)
 - **oreo**: [examples/oreo/oreo_test.yaml](examples/oreo/oreo_test.yaml)
 - **pymupdf**: [examples/pymupdf/pymupdf_test.yaml](examples/pymupdf/pymupdf_test.yaml)
-  
+
 **Note**: Please see the comments in the example YAML files for **documentation
  on the settings**.
 
@@ -241,7 +241,7 @@ conda create -n pymupdf-wf python=3.10 -y
 conda activate pymupdf-wf
 pip install -r requirements/pymupdf_requirements.txt
 ```
-to create a conda environment that serves the PDF extraction tools `PyMuPDF` and `pypdf`. 
+to create a conda environment that serves the PDF extraction tools `PyMuPDF` and `pypdf`.
 Both tools are lightweight and operational from the same conda environment.
 
 ## `pypdf` Pipeline Installation

diff --git a/pdfwf/parsers/__init__.py b/pdfwf/parsers/__init__.py
@@ -16,11 +16,18 @@
 from pdfwf.parsers.pymupdf import PyMuPDFParserConfig
 from pdfwf.parsers.pypdf import PyPDFParser
 from pdfwf.parsers.pypdf import PyPDFParserConfig
-
 from pdfwf.registry import registry
 
-ParserConfigTypes = MarkerParserConfig | OreoParserConfig | NougatParserConfig | PyMuPDFParserConfig | PyPDFParserConfig
-ParserTypes = MarkerParser| NougatParser | OreoParser | PyMuPDFParser | PyPDFParser
+ParserConfigTypes = (
+    MarkerParserConfig
+    | OreoParserConfig
+    | NougatParserConfig
+    | PyMuPDFParserConfig
+    | PyPDFParserConfig
+)
+ParserTypes = (
+    MarkerParser | NougatParser | OreoParser | PyMuPDFParser | PyPDFParser
+)
 
 _ParserTypes = tuple[type[ParserConfigTypes], type[ParserTypes]]
 

diff --git a/pdfwf/parsers/pymupdf.py b/pdfwf/parsers/pymupdf.py
@@ -2,17 +2,16 @@
 
 from __future__ import annotations
 
-import fitz
 import re
-
 from typing import Any
 from typing import Literal
 
+import fitz
+
 from pdfwf.parsers.base import BaseParser
 from pdfwf.parsers.base import BaseParserConfig
 from pdfwf.utils import exception_handler
 
-
 __all__ = [
     'PyMuPDFParser',
     'PyMuPDFParserConfig',
@@ -36,57 +35,58 @@ def __init__(self, config: PyMuPDFParserConfig) -> None:
         """Initialize the PyMuPDF parser."""
         self.config = config
         self.abstract_threshold = 580
-
-    def extract_doi_info(self, input_str:str) -> str:
-        """
-        Extracts doi from PyMUPDF metadata entry (if present)
-        """
-        match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str) 
+
+    def extract_doi_info(self, input_str: str) -> str:
+        """Extract doi from PyMUPDF metadata entry (if present)."""
+        match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
         if match:
             return match.group(2)
         else:
             return ''
 
-    def convert_single_pdf(self, pdf_path) -> str:
-        """Wraps PyMuPDF functionality"""
+    def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
+        """Wrap PyMuPDF functionality."""
         # open pdf
         doc = fitz.open(pdf_path)
 
         # scrape text
         text_list = []
         for page in doc:
             text_list.append(page.get_text())
-        full_text = "\n".join(text_list)
-        
+        full_text = '\n'.join(text_list)
+
         # get first page (as a proxy for `abstract`)
         first_page_text = text_list[0] if len(text_list) > 0 else ''
-       
+
         # metadata (available to PyMuPDF)
         title = doc.metadata.get('title', '')
         authors = doc.metadata.get('author', '')
         createdate = doc.metadata.get('creationDate', '')
         keywords = doc.metadata.get('keywords', '')
-        doi = self.extract_doi_info(doc.metadata.get('subject', '')) 
+        doi = self.extract_doi_info(doc.metadata.get('subject', ''))
         prod = doc.metadata.get('producer', '')
         form = doc.metadata.get('format', '')
-        abstract = doc.metadata.get('subject', '') if len(doc.metadata.get('subject', '')) > self.abstract_threshold else ''
+        abstract = (
+            doc.metadata.get('subject', '')
+            if len(doc.metadata.get('subject', '')) > self.abstract_threshold
+            else ''
+        )
 
         # - assemble
-        out_meta = {'title' : title, 
-                    'authors' : authors,
-                    'creationdate' : createdate, 
-                    'keywords' : keywords, 
-                    'doi' : doi, 
-                    'producer' : prod, 
-                    'format' : form, 
-                    'first_page' : first_page_text,
-                    'abstract' : abstract,
+        out_meta = {
+            'title': title,
+            'authors': authors,
+            'creationdate': createdate,
+            'keywords': keywords,
+            'doi': doi,
+            'producer': prod,
+            'format': form,
+            'first_page': first_page_text,
+            'abstract': abstract,
         }
-
-        # full text & metadata entries 
-        output = full_text, out_meta
-
-        return output
+
+        # full text & metadata entries
+        return full_text, out_meta
 
     @exception_handler(default_return=None)
     def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
@@ -103,7 +103,6 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
             A tuple containing the full text of the PDF and the metadata
             extracted from the PDF. If parsing fails, return None.
         """
-
         full_text, out_meta = self.convert_single_pdf(pdf_path)
 
         return full_text, out_meta

diff --git a/pdfwf/parsers/pypdf.py b/pdfwf/parsers/pypdf.py
@@ -2,18 +2,17 @@
 
 from __future__ import annotations
 
-from pypdf import PdfReader
-import re
 import logging
-
+import re
 from typing import Any
 from typing import Literal
 
+from pypdf import PdfReader
+
 from pdfwf.parsers.base import BaseParser
 from pdfwf.parsers.base import BaseParserConfig
 from pdfwf.utils import exception_handler
 
-
 __all__ = [
     'PyPDFParser',
     'PyPDFParserConfig',
@@ -41,49 +40,62 @@ def __init__(self, config: PyPDFParserConfig) -> None:
         # pypdf is verbose
         logging.getLogger().setLevel(logging.ERROR)
 
-    def extract_doi_info(self, input_str:str) -> str:
-        """
-        Extracts doi from pypdf metadata entry (if present)
-        """
+    def extract_doi_info(self, input_str: str) -> str:
+        """Extract doi from pypdf metadata entry (if present)."""
         match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
         if match:
             return match.group(2)
         else:
             return ''
 
-    def convert_single_pdf(self, pdf_path) -> str:
-        """Wraps pypdf functionality"""
+    def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
+        """Wrap pypdf functionality."""
         # open
         reader = PdfReader(pdf_path)
 
         # scrape text
-        full_text=''
+        full_text = ''
         for page in reader.pages:
-            full_text += page.extract_text(extraction_mode="layout")
+            full_text += page.extract_text(extraction_mode='layout')
 
-        first_page_text = reader.pages[0].extract_text(extraction_mode="layout") if len(reader.pages[0]) > 0 else ''
+        first_page_text = (
+            reader.pages[0].extract_text(extraction_mode='layout')
+            if len(reader.pages[0]) > 0
+            else ''
+        )
         meta = reader.metadata
 
         # metadata (available to pypdf)
         title = meta.get('/Title', '')
         authors = meta.get('/Author', '')
         createdate = meta.get('/CreationDate', '')
         keywords = meta.get('/Keywords', '')
-        doi = meta.get('/doi', '') if meta.get('/doi', '')!='' else self.extract_doi_info(meta.get('/Subject', ''))  # Use .get() to handle the missing DOI key
+        doi = (
+            meta.get('/doi', '')
+            if meta.get('/doi', '') != ''
+            else self.extract_doi_info(meta.get('/Subject', ''))
+        )  # Use .get() to handle the missing DOI key
         prod = meta.get('/Producer', '')
-        form = meta.get('/Format', '')  # Not included for pypdf, so we set it directly
-        abstract = meta.get('/Subject', '') if len(meta.get('/Subject', '')) > self.abstract_threshold else ''
+        form = meta.get(
+            '/Format', ''
+        )  # Not included for pypdf, so we set it directly
+        abstract = (
+            meta.get('/Subject', '')
+            if len(meta.get('/Subject', '')) > self.abstract_threshold
+            else ''
+        )
 
         # - assemble
-        out_meta = {'title' : title,
-                    'authors' : authors,
-                    'createdate' : createdate,
-                    'keywords' : keywords,
-                    'doi' : doi,
-                    'producer' : prod,
-                    'format' : form,
-                    'first_page' : first_page_text,
-                    'abstract' : abstract,
+        out_meta = {
+            'title': title,
+            'authors': authors,
+            'createdate': createdate,
+            'keywords': keywords,
+            'doi': doi,
+            'producer': prod,
+            'format': form,
+            'first_page': first_page_text,
+            'abstract': abstract,
         }
 
         # full text & metadata entries
@@ -106,7 +118,6 @@ def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
             A tuple containing the full text of the PDF and the metadata
             extracted from the PDF. If parsing fails, return None.
         """
-
         full_text, out_meta = self.convert_single_pdf(pdf_path)
 
         return full_text, out_meta