Skip to content

Commit

Permalink
Latest version of pdfplumber/pdfminer
Browse files (browse the repository at this point in the history)
  • Loading branch information
ubmarco committed Mar 14, 2024
1 parent 8216672 commit 648dbbf
Show file tree
Hide file tree
Showing 8 changed files with 374 additions and 246 deletions.
2 changes: 1 addition & 1 deletion libpdf/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def get_explict_dest(pdf, dest_list):
return [dest_page_num, dest_rect_x, dest_rect_y]


def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf): # pylint: disable=too-many-branches
def update_ann_info(annotation_page_map, ann_resolved, page, idx_page, pdf) -> None: # pylint: disable=too-many-branches
"""
Fetch the name of annotation, annotation location on the page and destination of the link annotation.
Expand Down
19 changes: 12 additions & 7 deletions libpdf/extract.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Core routines for PDF extraction."""

from __future__ import annotations

import itertools
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from typing import TYPE_CHECKING, List, Optional

import pdfplumber
import yaml
Expand Down Expand Up @@ -42,6 +44,9 @@
to_pdfplumber_bbox,
)

if TYPE_CHECKING:
from pdfplumber.page import Page as PdfplumberPage

LOG = logging.getLogger(__name__)


Expand Down Expand Up @@ -591,14 +596,15 @@ def extract_page_metadata(pdf):


def extract_figures(
pdf,
pages_list,
figure_dir,
) -> List[Figure]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up
pdf: pdfplumber.pdf.PDF,
pages_list: list[Page],
figure_dir: str,
) -> list[Figure]: # pylint: disable=too-many-nested-blocks, too-many-branches # local algorithm, easier to read when not split up
"""Extract figures in PDF."""
LOG.info("Extracting figures ...")
figure_list = []

page: PdfplumberPage
for idx_page, page in enumerate( # pylint: disable=too-many-nested-blocks
tqdm(
pdf.pages,
Expand All @@ -611,9 +617,8 @@ def extract_figures(
LOG.debug("Extracting figures page %s of %s", idx_page + 1, len(pdf.pages))
page_crop = pro.remove_page_header_footer(page)
lt_page = page._layout # pylint: disable=protected-access # easiest way to obtain LTPage

# check and filter figures
figures = check_and_filter_figures(page_crop.figures)
figures = check_and_filter_figures(page_crop.objects.get("image", []))

if len(figures) != 0:
for idx_figure, figure in enumerate(figures):
Expand Down
12 changes: 5 additions & 7 deletions libpdf/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
"""

import datetime
import decimal
import json
import logging
import os
import sys
from typing import Dict, List, Optional, Union

import ruamel.yaml
from pdfplumber.page import Page as PdfplumberPage
from ruamel.yaml.representer import RoundTripRepresenter

from libpdf import parameters
Expand All @@ -37,20 +37,18 @@
LOG = logging.getLogger(__name__)


def remove_page_header_footer(single_page):
def remove_page_header_footer(single_page: PdfplumberPage) -> PdfplumberPage:
"""Remove header and footer."""
page_crop = single_page.within_bbox(
return single_page.within_bbox(
(
0,
decimal.Decimal(parameters.PAGE_CROP_MARGINS["top"]),
parameters.PAGE_CROP_MARGINS["top"],
single_page.width,
single_page.height
- decimal.Decimal(parameters.PAGE_CROP_MARGINS["bottom"]),
- parameters.PAGE_CROP_MARGINS["bottom"],
),
)

return page_crop


class MyRepresenter(RoundTripRepresenter): # pylint: disable=too-few-public-methods
"""Customized representer of yaml."""
Expand Down
6 changes: 2 additions & 4 deletions libpdf/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,11 @@ def extract_pdf_table(pdf, pages_list: List[Page], figure_list: List[Figure]):
"edge_min_length": 3,
"min_words_vertical": 3,
"min_words_horizontal": 1,
"keep_blank_chars": False,
"text_keep_blank_chars": False,
"text_tolerance": 3,
"text_x_tolerance": 2,
"text_y_tolerance": 2,
"intersection_tolerance": 3,
"intersection_x_tolerance": None,
"intersection_y_tolerance": None,
}

table_dict = {"page": {}}
Expand Down Expand Up @@ -164,7 +162,7 @@ def extract_cells(lt_page: LTPage, rows: List, list_cell: List[Cell], page: Page
row_cell[1],
row_cell[2],
row_cell[3],
Decimal(lt_page.height),
lt_page.height,
)
pos_cell = Position(
pos_cell_bbox[0],
Expand Down
4 changes: 1 addition & 3 deletions libpdf/textbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,9 +957,7 @@ def pdfminer_get_lt_textboxes(pdf) -> Dict[int, List[LTTextBox]]:
if logging_needed(idx_page, len(pdf.pages)):
LOG.debug("Extracting layout page %s of %s", idx_page + 1, len(pdf.pages))

pdf.interpreter.process_page(page.page_obj)
result = pdf.device.get_result()
lt_textboxes = [obj for obj in result if isinstance(obj, LTTextBox)]
lt_textboxes = [obj for obj in page.layout._objs if isinstance(obj, LTTextBox)]
# remove detected header and footer lt_textboxes based on given page crop margin parameter
filter_lt_textboxes = list(
filter(
Expand Down
23 changes: 8 additions & 15 deletions libpdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,12 @@ def to_pdfplumber_bbox(
x1: float,
y1: float,
page_height: float,
) -> list[Decimal]:
) -> list[float]:
"""
Convert PDF standard or pdfminer bbox coordinates to pdfplumber bbox coordinates.
The function is needed because for pdfplumber:
- y coordinates are inverted
- Decimal type is needed
Some diagram may help::
Expand Down Expand Up @@ -180,20 +179,15 @@ def to_pdfplumber_bbox(
:param page_height: height of the page
:return: [x0, top, x1, bottom]
"""
# pylint: disable=invalid-name # short is better here
ret_x0 = Decimal(x0)
ret_y0 = Decimal(Decimal(page_height) - Decimal(y1))
ret_x1 = Decimal(x1)
ret_y1 = Decimal(Decimal(page_height) - Decimal(y0))
return [ret_x0, ret_y0, ret_x1, ret_y1]
return [x0, page_height - y1, x1, page_height - y0]


def from_pdfplumber_bbox(
x0: Decimal,
top: Decimal,
x1: Decimal,
bottom: Decimal,
page_height: Decimal,
x0: float,
top: float,
x1: float,
bottom: float,
page_height: float,
) -> list[float]:
"""
Convert pdfplumber bbox coordinates to PDF standard.
Expand All @@ -205,8 +199,7 @@ def from_pdfplumber_bbox(
:param page_height: height of the page
:return: [x0, y0, x1, y1]
"""
# pylint: disable=invalid-name # short is better here
return [float(x0), float(page_height - bottom), float(x1), float(page_height - top)]
return [x0, page_height - bottom, x1, page_height - top]


def check_lt_obj_in_bbox(
Expand Down
Loading

0 comments on commit 648dbbf

Please sign in to comment.