Latest version of pdfplumber/pdfminer

useblocks · Mar 14, 2024 · 648dbbf
1 parent 8216672
commit 648dbbf
Show file tree

Hide file tree

Showing 8 changed files with 374 additions and 246 deletions.
diff --git a/libpdf/catalog.py b/libpdf/catalog.py
@@ -400,7 +400,7 @@ def get_explict_dest(pdf, dest_list):
     return [dest_page_num, dest_rect_x, dest_rect_y]
 
 
-def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf):  # pylint: disable=too-many-branches
+def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) -> None:  # pylint: disable=too-many-branches
     """
     Fetch the name of annotation, annotation location on the page and destination of the link annotation.
 

diff --git a/libpdf/extract.py b/libpdf/extract.py
@@ -1,11 +1,13 @@
 """Core routines for PDF extraction."""
 
+from __future__ import annotations
+
 import itertools
 import logging
 import os
 from datetime import datetime
 from pathlib import Path
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import pdfplumber
 import yaml
@@ -42,6 +44,9 @@
     to_pdfplumber_bbox,
 )
 
+if TYPE_CHECKING:
+    from pdfplumber.page import Page as PdfplumberPage
+
 LOG = logging.getLogger(__name__)
 
 
@@ -591,14 +596,15 @@ def extract_page_metadata(pdf):
 
 
 def extract_figures(
-    pdf,
-    pages_list,
-    figure_dir,
-) -> List[Figure]:  # pylint: disable=too-many-nested-blocks, too-many-branches  # local algorithm, easier to read when not split up
+    pdf: pdfplumber.pdf.PDF,
+    pages_list: list[Page],
+    figure_dir: str,
+) -> list[Figure]:  # pylint: disable=too-many-nested-blocks, too-many-branches  # local algorithm, easier to read when not split up
     """Extract figures in PDF."""
     LOG.info("Extracting figures ...")
     figure_list = []
 
+    page: PdfplumberPage
     for idx_page, page in enumerate(  # pylint: disable=too-many-nested-blocks
         tqdm(
             pdf.pages,
@@ -611,9 +617,8 @@ def extract_figures(
             LOG.debug("Extracting figures page %s of %s", idx_page + 1, len(pdf.pages))
         page_crop = pro.remove_page_header_footer(page)
         lt_page = page._layout  # pylint: disable=protected-access  # easiest way to obtain LTPage
-
         # check and filter figures
-        figures = check_and_filter_figures(page_crop.figures)
+        figures = check_and_filter_figures(page_crop.objects.get("image", []))
 
         if len(figures) != 0:
             for idx_figure, figure in enumerate(figures):

diff --git a/libpdf/process.py b/libpdf/process.py
@@ -9,14 +9,14 @@
 """
 
 import datetime
-import decimal
 import json
 import logging
 import os
 import sys
 from typing import Dict, List, Optional, Union
 
 import ruamel.yaml
+from pdfplumber.page import Page as PdfplumberPage
 from ruamel.yaml.representer import RoundTripRepresenter
 
 from libpdf import parameters
@@ -37,20 +37,18 @@
 LOG = logging.getLogger(__name__)
 
 
-def remove_page_header_footer(single_page):
+def remove_page_header_footer(single_page: PdfplumberPage) -> PdfplumberPage:
     """Remove header and footer."""
-    page_crop = single_page.within_bbox(
+    return single_page.within_bbox(
         (
             0,
-            decimal.Decimal(parameters.PAGE_CROP_MARGINS["top"]),
+            parameters.PAGE_CROP_MARGINS["top"],
             single_page.width,
             single_page.height
-            - decimal.Decimal(parameters.PAGE_CROP_MARGINS["bottom"]),
+            - parameters.PAGE_CROP_MARGINS["bottom"],
         ),
     )
 
-    return page_crop
-
 
 class MyRepresenter(RoundTripRepresenter):  # pylint: disable=too-few-public-methods
     """Customized representer of yaml."""

diff --git a/libpdf/tables.py b/libpdf/tables.py
@@ -69,13 +69,11 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]):
         "edge_min_length": 3,
         "min_words_vertical": 3,
         "min_words_horizontal": 1,
-        "keep_blank_chars": False,
+        "text_keep_blank_chars": False,
         "text_tolerance": 3,
         "text_x_tolerance": 2,
         "text_y_tolerance": 2,
         "intersection_tolerance": 3,
-        "intersection_x_tolerance": None,
-        "intersection_y_tolerance": None,
     }
 
     table_dict = {"page": {}}
@@ -164,7 +162,7 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page
                     row_cell[1],
                     row_cell[2],
                     row_cell[3],
-                    Decimal(lt_page.height),
+                    lt_page.height,
                 )
                 pos_cell = Position(
                     pos_cell_bbox[0],

diff --git a/libpdf/textbox.py b/libpdf/textbox.py
@@ -957,9 +957,7 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
         if logging_needed(idx_page, len(pdf.pages)):
             LOG.debug("Extracting layout page %s of %s", idx_page + 1, len(pdf.pages))
 
-        pdf.interpreter.process_page(page.page_obj)
-        result = pdf.device.get_result()
-        lt_textboxes = [obj for obj in result if isinstance(obj, LTTextBox)]
+        lt_textboxes = [obj for obj in page.layout._objs if isinstance(obj, LTTextBox)]
         # remove detected header and footer lt_textboxes based on given page crop margin parameter
         filter_lt_textboxes = list(
             filter(

diff --git a/libpdf/utils.py b/libpdf/utils.py
@@ -144,13 +144,12 @@ def to_pdfplumber_bbox(
     x1: float,
     y1: float,
     page_height: float,
-) -> list[Decimal]:
+) -> list[float]:
     """
     Convert PDF standard or pdfminer bbox coordinates to pdfplumber bbox coordinates.
 
     The function is needed because for pdfplumber:
     - y coordinates are inverted
-    - Decimal type is needed
 
     Some diagram may help::
 
@@ -180,20 +179,15 @@ def to_pdfplumber_bbox(
     :param page_height: height of the page
     :return: [x0, top, x1, bottom]
     """
-    # pylint: disable=invalid-name  # short is better here
-    ret_x0 = Decimal(x0)
-    ret_y0 = Decimal(Decimal(page_height) - Decimal(y1))
-    ret_x1 = Decimal(x1)
-    ret_y1 = Decimal(Decimal(page_height) - Decimal(y0))
-    return [ret_x0, ret_y0, ret_x1, ret_y1]
+    return [x0, page_height - y1, x1, page_height - y0]
 
 
 def from_pdfplumber_bbox(
-    x0: Decimal,
-    top: Decimal,
-    x1: Decimal,
-    bottom: Decimal,
-    page_height: Decimal,
+    x0: float,
+    top: float,
+    x1: float,
+    bottom: float,
+    page_height: float,
 ) -> list[float]:
     """
     Convert pdfplumber bbox coordinates to PDF standard.
@@ -205,8 +199,7 @@ def from_pdfplumber_bbox(
     :param page_height: height of the page
     :return: [x0, y0, x1, y1]
     """
-    # pylint: disable=invalid-name  # short is better here
-    return [float(x0), float(page_height - bottom), float(x1), float(page_height - top)]
+    return [x0, page_height - bottom, x1, page_height - top]
 
 
 def check_lt_obj_in_bbox(