refactor

ramanathanlab · Sep 13, 2024 · aa7cf20 · aa7cf20
1 parent 8234c18
commit aa7cf20
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 72 deletions.
diff --git a/pdfwf/parsers/pymupdf.py b/pdfwf/parsers/pymupdf.py
@@ -26,10 +26,7 @@ class PyMuPDFParserConfig(BaseParserConfig):
 
 
 class PyMuPDFParser(BaseParser):
-    """Warmstart interface for the PyMuPDF PDF parser.
-
-    No warmsart eneded as PyMuPDF is a Python library using CPUs only
-    """
+    """Interface for the PyMuPDF PDF parser."""
 
     def __init__(self, config: PyMuPDFParserConfig) -> None:
         """Initialize the marker parser."""
@@ -39,26 +36,36 @@ def __init__(self, config: PyMuPDFParserConfig) -> None:
     def extract_doi_info(self, input_str: str) -> str:
         """Extract doi from PyMUPDF metadata entry (if present)."""
         match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
-        if match:
-            return match.group(2)
-        else:
-            return ''
-
-    def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
-        """Wrap PyMuPDF functionality."""
-        # open pdf
+        return match.group(2) if match else ''
+
+    @exception_handler(default_return=None)
+    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
+        """Parse a PDF file.
+
+        Parameters
+        ----------
+        pdf_path : str
+            Path to the PDF file to convert.
+
+        Returns
+        -------
+        tuple[str, dict[str, str]] | None
+            A tuple containing the full text of the PDF and the metadata
+            extracted from the PDF. If parsing fails, return None.
+        """
+        # Open pdf
         doc = fitz.open(pdf_path)
 
-        # scrape text
+        # Scrape text
         text_list = []
         for page in doc:
             text_list.append(page.get_text())
         full_text = '\n'.join(text_list)
 
-        # get first page (asa proxy for `abstract`)
+        # Get first page (as a proxy for `abstract`)
         first_page_text = text_list[0] if len(text_list) > 0 else ''
 
-        # metadata (available to PyMuPDF)
+        # Metadata (available to PyMuPDF)
         title = doc.metadata.get('title', '')
         authors = doc.metadata.get('author', '')
         createdate = doc.metadata.get('creationDate', '')
@@ -72,7 +79,7 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
             else ''
         )
 
-        # - assemble
+        # Assemble the metadata
         out_meta = {
             'title': title,
             'authors': authors,
@@ -85,26 +92,10 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
             'abstract': abstract,
         }
 
-        # full text & metadata entries
-        return full_text, out_meta
-
-    @exception_handler(default_return=None)
-    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
-        """Parse a PDF file and extract markdown.
-
-        Parameters
-        ----------
-        pdf_path : str
-            Path to the PDF file to convert.
-
-        Returns
-        -------
-        tuple[str, dict[str, str]] | None
-            A tuple containing the full text of the PDF and the metadata
-            extracted from the PDF. If parsing fails, return None.
-        """
-        full_text, out_meta = self.convert_single_pdf(pdf_path)
+        # TODO: Should we close the document?
+        # doc.close()
 
+        # Full text & metadata entries
         return full_text, out_meta
 
     @exception_handler(default_return=None)

diff --git a/pdfwf/parsers/pypdf.py b/pdfwf/parsers/pypdf.py
@@ -43,17 +43,28 @@ def __init__(self, config: PyPDFParserConfig) -> None:
     def extract_doi_info(self, input_str: str) -> str:
         """Extract doi from pypdf metadata entry (if present)."""
         match = re.search(r'(doi:\s*|doi\.org/)(\S+)', input_str)
-        if match:
-            return match.group(2)
-        else:
-            return ''
-
-    def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
-        """Wrap pypdf functionality."""
-        # open
+        return match.group(2) if match else ''
+
+    @exception_handler(default_return=None)
+    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
+        """Parse a PDF file.
+
+        Parameters
+        ----------
+        pdf_path : str
+            Path to the PDF file to convert.
+
+        Returns
+        -------
+        tuple[str, dict[str, str]] | None
+            A tuple containing the full text of the PDF and the metadata
+            extracted from the PDF. If parsing fails, return None.
+        """
+        # TODO: This needs to be closed
+        # Open
         reader = PdfReader(pdf_path)
 
-        # scrape text
+        # Scrape text
         full_text = ''
         for page in reader.pages:
             full_text += page.extract_text(extraction_mode='layout')
@@ -65,27 +76,27 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
         )
         meta = reader.metadata
 
-        # metadata (available to pypdf)
+        # Metadata (available to pypdf)
         title = meta.get('/Title', '')
         authors = meta.get('/Author', '')
         createdate = meta.get('/CreationDate', '')
         keywords = meta.get('/Keywords', '')
+        # Use .get() to handle the missing DOI key
         doi = (
             meta.get('/doi', '')
             if meta.get('/doi', '') != ''
             else self.extract_doi_info(meta.get('/Subject', ''))
-        )  # Use .get() to handle the missing DOI key
+        )
         prod = meta.get('/Producer', '')
-        form = meta.get(
-            '/Format', ''
-        )  # Not included for pypdf, so we set it directly
+        # Not included for pypdf, so we set it directly
+        form = meta.get('/Format', '')
         abstract = (
             meta.get('/Subject', '')
             if len(meta.get('/Subject', '')) > self.abstract_threshold
             else ''
         )
 
-        # - assemble
+        # Assemble the metadata
         out_meta = {
             'title': title,
             'authors': authors,
@@ -98,28 +109,6 @@ def convert_single_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]]:
             'abstract': abstract,
         }
 
-        # full text & metadata entries
-        output = full_text, out_meta
-
-        return output
-
-    @exception_handler(default_return=None)
-    def parse_pdf(self, pdf_path: str) -> tuple[str, dict[str, str]] | None:
-        """Parse a PDF file and extract markdown.
-
-        Parameters
-        ----------
-        pdf_path : str
-            Path to the PDF file to convert.
-
-        Returns
-        -------
-        tuple[str, dict[str, str]] | None
-            A tuple containing the full text of the PDF and the metadata
-            extracted from the PDF. If parsing fails, return None.
-        """
-        full_text, out_meta = self.convert_single_pdf(pdf_path)
-
         return full_text, out_meta
 
     @exception_handler(default_return=None)