From dfa4b9e5cdfeb0cb645ba7c9184c5b522bdaae2a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 30 Sep 2024 22:41:27 -0400 Subject: [PATCH 01/34] feat: use playa instead of pdfminer --- pdfplumber/__init__.py | 4 ---- pdfplumber/convert.py | 2 +- pdfplumber/page.py | 10 +++++----- pdfplumber/pdf.py | 15 +++++++-------- pdfplumber/structure.py | 10 +++++----- pdfplumber/utils/pdfinternals.py | 20 +++----------------- tests/test_structure.py | 2 +- tests/test_utils.py | 4 ++-- 8 files changed, 24 insertions(+), 43 deletions(-) diff --git a/pdfplumber/__init__.py b/pdfplumber/__init__.py index 9fd52591..878ed0c8 100644 --- a/pdfplumber/__init__.py +++ b/pdfplumber/__init__.py @@ -1,15 +1,11 @@ __all__ = [ "__version__", "utils", - "pdfminer", "open", "repair", "set_debug", ] -import pdfminer -import pdfminer.pdftypes - from . import utils from ._version import __version__ from .pdf import PDF diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index 5c285d09..fbaf8ce8 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -1,7 +1,7 @@ import base64 from typing import Any, Callable, Dict, List, Optional, Tuple -from pdfminer.psparser import PSLiteral +from playa.psparser import PSLiteral from .utils import decode_text diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 34dae7d3..93f7cc0a 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -15,8 +15,8 @@ from unicodedata import normalize as normalize_unicode from warnings import warn -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import ( +from playa.converter import PDFPageAggregator +from playa.layout import ( LTChar, LTComponent, LTContainer, @@ -25,9 +25,9 @@ LTPage, LTTextContainer, ) -from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT -from pdfminer.pdfpage import PDFPage -from pdfminer.psparser import PSLiteral +from playa.pdfinterp import PDFPageInterpreter, PDFStackT +from playa.pdfpage import PDFPage +from playa.psparser import PSLiteral from . import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 3252dde2..afecce06 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -5,12 +5,11 @@ from types import TracebackType from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union -from pdfminer.layout import LAParams -from pdfminer.pdfdocument import PDFDocument -from pdfminer.pdfinterp import PDFResourceManager -from pdfminer.pdfpage import PDFPage -from pdfminer.pdfparser import PDFParser -from pdfminer.psparser import PSException +from playa.layout import LAParams +from playa.pdfdocument import PDFDocument +from playa.pdfinterp import PDFResourceManager +from playa.pdfpage import PDFPage +from playa.psparser import PSException from ._typing import T_num, T_obj_list from .container import Container @@ -46,7 +45,7 @@ def __init__( self.unicode_norm = unicode_norm self.raise_unicode_errors = raise_unicode_errors - self.doc = PDFDocument(PDFParser(stream), password=password or "") + self.doc = PDFDocument(stream, password=password or "") self.rsrcmgr = PDFResourceManager() self.metadata = {} @@ -146,7 +145,7 @@ def pages(self) -> List[Page]: doctop: T_num = 0 pp = self.pages_to_parse self._pages: List[Page] = [] - for i, page in enumerate(PDFPage.create_pages(self.doc)): + for i, page in enumerate(self.doc.get_pages()): page_number = i + 1 if pp is not None and page_number not in pp: continue diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py index 88749d76..45f25daf 100644 --- a/pdfplumber/structure.py +++ b/pdfplumber/structure.py @@ -17,10 +17,10 @@ Union, ) -from pdfminer.data_structures import NumberTree -from pdfminer.pdfparser import PDFParser -from pdfminer.pdftypes import PDFObjRef, resolve1 -from pdfminer.psparser import PSLiteral +from playa.data_structures import NumberTree +from playa.pdfparser import KEYWORD_NULL +from playa.pdftypes import PDFObjRef, resolve1 +from playa.psparser import PSLiteral from ._typing import T_bbox, T_obj from .utils import decode_text, geometry @@ -316,7 +316,7 @@ def _parse_parent_tree(self, parent_array: List[Any]) -> None: ref = d.popleft() # In the case where an MCID is not associated with any # structure, there will be a "null" in the parent tree. - if ref == PDFParser.KEYWORD_NULL: + if ref == KEYWORD_NULL: continue if repr(ref) in s: continue diff --git a/pdfplumber/utils/pdfinternals.py b/pdfplumber/utils/pdfinternals.py index 2ba50643..f2460c9b 100644 --- a/pdfplumber/utils/pdfinternals.py +++ b/pdfplumber/utils/pdfinternals.py @@ -1,22 +1,8 @@ from typing import Any, List, Optional, Union -from pdfminer.pdftypes import PDFObjRef -from pdfminer.psparser import PSLiteral -from pdfminer.utils import PDFDocEncoding - - -def decode_text(s: Union[bytes, str]) -> str: - """ - Decodes a PDFDocEncoding string to Unicode. - Adds py3 compatibility to pdfminer's version. - """ - if isinstance(s, bytes) and s.startswith(b"\xfe\xff"): - return str(s[2:], "utf-16be", "ignore") - try: - ords = (ord(c) if isinstance(c, str) else c for c in s) - return "".join(PDFDocEncoding[o] for o in ords) - except IndexError: - return str(s) +from playa.pdftypes import PDFObjRef +from playa.psparser import PSLiteral +from playa.utils import PDFDocEncoding, decode_text def resolve_and_decode(obj: Any) -> Any: diff --git a/tests/test_structure.py b/tests/test_structure.py index 4933a052..39489fad 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -5,7 +5,7 @@ import unittest from collections import deque -from pdfminer.pdftypes import resolve1 +from playa.pdftypes import resolve1 import pdfplumber from pdfplumber.structure import PDFStructTree diff --git a/tests/test_utils.py b/tests/test_utils.py index 81fc5c8e..80402e66 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,8 +8,8 @@ import pandas as pd import pytest -from pdfminer.pdfparser import PDFObjRef -from pdfminer.psparser import PSLiteral +from playa.pdfparser import PDFObjRef +from playa.psparser import PSLiteral import pdfplumber from pdfplumber import utils From ddd0532507af11d748f4f997c3959ec6973b9dba Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Mon, 30 Sep 2024 22:58:43 -0400 Subject: [PATCH 02/34] feat: playa does the right thing for mcids --- pdfplumber/page.py | 53 +--------------------------------------------- 1 file changed, 1 insertion(+), 52 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 93f7cc0a..e5aeffdf 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -128,57 +128,6 @@ def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]: } -class PDFPageAggregatorWithMarkedContent(PDFPageAggregator): - """Extract layout from a specific page, adding marked-content IDs to - objects where found.""" - - cur_mcid: Optional[int] = None - cur_tag: Optional[str] = None - - def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None: - """Handle beginning of tag, setting current MCID if any.""" - self.cur_tag = decode_text(tag.name) - if isinstance(props, dict) and "MCID" in props: - self.cur_mcid = props["MCID"] - else: - self.cur_mcid = None - - def end_tag(self) -> None: - """Handle beginning of tag, clearing current MCID.""" - self.cur_tag = None - self.cur_mcid = None - - def tag_cur_item(self) -> None: - """Add current MCID to what we hope to be the most recent object created - by pdfminer.six.""" - # This is somewhat hacky and would not be necessary if - # pdfminer.six supported MCIDs. In reading the code it's - # clear that the `render_*` methods methods will only ever - # create one object, but that is far from being guaranteed. - # Even if pdfminer.six's API would just return the objects it - # creates, we wouldn't have to do this. - if self.cur_item._objs: - cur_obj = self.cur_item._objs[-1] - cur_obj.mcid = self.cur_mcid # type: ignore - cur_obj.tag = self.cur_tag # type: ignore - - def render_char(self, *args, **kwargs) -> float: # type: ignore - """Hook for rendering characters, adding the `mcid` attribute.""" - adv = super().render_char(*args, **kwargs) - self.tag_cur_item() - return adv - - def render_image(self, *args, **kwargs) -> None: # type: ignore - """Hook for rendering images, adding the `mcid` attribute.""" - super().render_image(*args, **kwargs) - self.tag_cur_item() - - def paint_path(self, *args, **kwargs) -> None: # type: ignore - """Hook for rendering lines and curves, adding the `mcid` attribute.""" - super().paint_path(*args, **kwargs) - self.tag_cur_item() - - def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox: # Per PDF Reference 3.8.4: "Note: Although rectangles are # conventionally specified by their lower-left and upperright @@ -270,7 +219,7 @@ def structure_tree(self) -> List[Dict[str, Any]]: def layout(self) -> LTPage: if hasattr(self, "_layout"): return self._layout - device = PDFPageAggregatorWithMarkedContent( + device = PDFPageAggregator( self.pdf.rsrcmgr, pageno=self.page_number, laparams=self.pdf.laparams, From 87a0fa3d45a63b131786285012164d2b4bf99d3b Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 1 Oct 2024 09:25:00 -0400 Subject: [PATCH 03/34] fix: playa exposes ncs/scs --- pdfplumber/page.py | 8 ++------ tests/test_convert.py | 6 +++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index e5aeffdf..501e95de 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -327,13 +327,9 @@ def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]: attr["page_number"] = self.page_number for cs in ["ncs", "scs"]: - # Note: As of pdfminer.six v20221105, that library only - # exposes ncs for LTChars, and neither attribute for - # other objects. Keeping this code here, though, - # for ease of addition if color spaces become - # more available via pdfminer.six if hasattr(obj, cs): - attr[cs] = resolve_and_decode(getattr(obj, cs).name) + csobj = getattr(obj, cs) + attr[cs] = resolve_and_decode(csobj.name) for color_attr, pattern_attr in [ ("stroking_color", "stroking_pattern"), diff --git a/tests/test_convert.py b/tests/test_convert.py index 2508b7ed..ea9d6164 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -178,7 +178,7 @@ def test_csv(self): assert c.split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0, 0, 0)",,,,18.0,,,,,,,Y,,1,' + ',,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,,,,Y,,1,' ) io = StringIO() @@ -245,7 +245,7 @@ def test_cli_csv(self): assert res.decode("utf-8").split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0, 0, 0)",,,,18.0,,,,,,,Y,,1,' + ',,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,,,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -271,7 +271,7 @@ def test_cli_csv_exclude(self): assert res.decode("utf-8").split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT," - ',,,"(0, 0, 0)",,,18.0,,,,,,Y,,1,' + ',,"(0, 0, 0)",,,DeviceGray,18.0,,,,,,Y,,1,' ) def test_cli_csv_include(self): From fffd551af614dce0dfc7a963fc2b30f176555c89 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 1 Oct 2024 16:18:08 -0400 Subject: [PATCH 04/34] fix: update to handle parsed pages --- pdfplumber/page.py | 14 +++-- pdfplumber/pdf.py | 13 +++-- tests/test_structure.py | 112 ---------------------------------------- 3 files changed, 18 insertions(+), 121 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 501e95de..243373cb 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -25,14 +25,15 @@ LTPage, LTTextContainer, ) -from playa.pdfinterp import PDFPageInterpreter, PDFStackT +from playa.pdfinterp import PDFPageInterpreter from playa.pdfpage import PDFPage from playa.psparser import PSLiteral +from playa.pdfstructtree import PDFStructTree +from playa.exceptions import PDFNoStructTree from . import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list from .container import Container -from .structure import PDFStructTree, StructTreeMissing from .table import T_table_settings, Table, TableFinder, TableSettings from .utils import decode_text, resolve_all, resolve_and_decode from .utils.text import TextMap @@ -211,8 +212,13 @@ def height(self) -> T_num: def structure_tree(self) -> List[Dict[str, Any]]: """Return the structure tree for a page, if any.""" try: - return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)] - except StructTreeMissing: + return [ + elem.to_dict() + for elem in PDFStructTree( + self.pdf.doc, [(None, self.page_obj)] + ) + ] + except PDFNoStructTree: return [] @property diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index afecce06..42af3965 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -8,14 +8,13 @@ from playa.layout import LAParams from playa.pdfdocument import PDFDocument from playa.pdfinterp import PDFResourceManager -from playa.pdfpage import PDFPage -from playa.psparser import PSException +from playa.exceptions import PSException, PDFNoStructTree +from playa.pdfstructtree import PDFStructTree from ._typing import T_num, T_obj_list from .container import Container from .page import Page from .repair import T_repair_setting, _repair -from .structure import PDFStructTree, StructTreeMissing from .utils import resolve_and_decode logger = logging.getLogger(__name__) @@ -179,8 +178,12 @@ def hyperlinks(self) -> List[Dict[str, Any]]: def structure_tree(self) -> List[Dict[str, Any]]: """Return the structure tree for the document.""" try: - return [elem.to_dict() for elem in PDFStructTree(self)] - except StructTreeMissing: + if self.pages_to_parse is None: + numbered_pages = None + else: + numbered_pages = zip(self.pages_to_parse, (p.page_obj for p in self.pages)) + return [elem.to_dict() for elem in PDFStructTree(self.doc, numbered_pages)] + except PDFNoStructTree: return [] def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]: diff --git a/tests/test_structure.py b/tests/test_structure.py index 39489fad..0d64467f 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -1,14 +1,12 @@ #!/usr/bin/env python3 import os -import re import unittest from collections import deque from playa.pdftypes import resolve1 import pdfplumber -from pdfplumber.structure import PDFStructTree HERE = os.path.abspath(os.path.dirname(__file__)) TREE = [ @@ -854,116 +852,6 @@ def test_structure_tree(self): ] -class TestClass(unittest.TestCase): - """Test the underlying Structure tree class""" - - def test_structure_tree_class(self): - path = os.path.join(HERE, "pdfs/image_structure.pdf") - pdf = pdfplumber.open(path) - stree = PDFStructTree(pdf, pdf.pages[0]) - doc_elem = next(iter(stree)) - assert [k.type for k in doc_elem] == ["P", "P", "Figure"] - - def test_find_all_tree(self): - """ - Test find_all() and find() on trees - """ - path = os.path.join(HERE, "pdfs/image_structure.pdf") - pdf = pdfplumber.open(path) - stree = PDFStructTree(pdf, pdf.pages[0]) - figs = list(stree.find_all("Figure")) - assert len(figs) == 1 - fig = stree.find("Figure") - assert fig == figs[0] - assert stree.find("Fogure") is None - figs = list(stree.find_all(re.compile(r"Fig.*"))) - assert len(figs) == 1 - figs = list(stree.find_all(lambda x: x.type == "Figure")) - assert len(figs) == 1 - figs = list(stree.find_all("Foogure")) - assert len(figs) == 0 - figs = list(stree.find_all(re.compile(r"Fog.*"))) - assert len(figs) == 0 - figs = list(stree.find_all(lambda x: x.type == "Flogger")) - assert len(figs) == 0 - - def test_find_all_element(self): - """ - Test find_all() and find() on elements - """ - path = os.path.join(HERE, "pdfs/pdf_structure.pdf") - pdf = pdfplumber.open(path) - stree = PDFStructTree(pdf) - for list_elem in stree.find_all("L"): - items = list(list_elem.find_all("LI")) - assert items - for item in items: - body = list(item.find_all("LBody")) - assert body - body1 = item.find("LBody") - assert body1 == body[0] - assert item.find("Loonie") is None - - def test_all_mcids(self): - """ - Test all_mcids() - """ - path = os.path.join(HERE, "pdfs/2023-06-20-PV.pdf") - pdf = pdfplumber.open(path) - # Make sure we can get them with page numbers - stree = PDFStructTree(pdf) - sect = next(stree.find_all("Sect")) - mcids = list(sect.all_mcids()) - pages = set(page for page, mcid in mcids) - assert 1 in pages - assert 2 in pages - # If we take only a single page there are no page numbers - # (FIXME: may wish to reconsider this API decision...) - page = pdf.pages[1] - stree = PDFStructTree(pdf, page) - sect = next(stree.find_all("Sect")) - mcids = list(sect.all_mcids()) - pages = set(page for page, mcid in mcids) - assert None in pages - assert 1 not in pages - assert 2 not in pages - # Assure that we get the MCIDs for a content element - for p in sect.find_all("P"): - assert set(mcid for page, mcid in p.all_mcids()) == set(p.mcids) - - def test_element_bbox(self): - """ - Test various ways of getting element bboxes - """ - path = os.path.join(HERE, "pdfs/pdf_structure.pdf") - pdf = pdfplumber.open(path) - stree = PDFStructTree(pdf) - # As BBox attribute - table = next(stree.find_all("Table")) - assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25) - # With child elements - tr = next(table.find_all("TR")) - assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9) - # From a specific page it should also work - stree = PDFStructTree(pdf, pdf.pages[0]) - table = next(stree.find_all("Table")) - assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 555.3, 542.25) - tr = next(table.find_all("TR")) - assert tuple(stree.element_bbox(tr)) == (56.8, 495.9, 328.312, 507.9) - # Yeah but what happens if you crop the page? - page = pdf.pages[0].crop((10, 400, 500, 500)) - stree = PDFStructTree(pdf, page) - table = next(stree.find_all("Table")) - # The element gets cropped too - assert tuple(stree.element_bbox(table)) == (56.7, 489.9, 500, 500) - # And if you crop it out of the page? - page = pdf.pages[0].crop((0, 0, 560, 400)) - stree = PDFStructTree(pdf, page) - table = next(stree.find_all("Table")) - with self.assertRaises(IndexError): - _ = stree.element_bbox(table) - - class TestUnparsed(unittest.TestCase): """Test handling of PDFs with unparsed pages.""" From e64d50933305ae99bc66b4711bec0ce9b568173a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 1 Oct 2024 18:41:38 -0400 Subject: [PATCH 05/34] chore: format, lint --- pdfplumber/page.py | 8 +++----- pdfplumber/pdf.py | 6 ++++-- pdfplumber/utils/pdfinternals.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 243373cb..84cbae09 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -16,6 +16,7 @@ from warnings import warn from playa.converter import PDFPageAggregator +from playa.exceptions import PDFNoStructTree from playa.layout import ( LTChar, LTComponent, @@ -27,9 +28,8 @@ ) from playa.pdfinterp import PDFPageInterpreter from playa.pdfpage import PDFPage -from playa.psparser import PSLiteral from playa.pdfstructtree import PDFStructTree -from playa.exceptions import PDFNoStructTree +from playa.psparser import PSLiteral from . import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list @@ -214,9 +214,7 @@ def structure_tree(self) -> List[Dict[str, Any]]: try: return [ elem.to_dict() - for elem in PDFStructTree( - self.pdf.doc, [(None, self.page_obj)] - ) + for elem in PDFStructTree(self.pdf.doc, [(None, self.page_obj)]) ] except PDFNoStructTree: return [] diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 42af3965..bd8e9cea 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -5,10 +5,10 @@ from types import TracebackType from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union +from playa.exceptions import PDFNoStructTree, PSException from playa.layout import LAParams from playa.pdfdocument import PDFDocument from playa.pdfinterp import PDFResourceManager -from playa.exceptions import PSException, PDFNoStructTree from playa.pdfstructtree import PDFStructTree from ._typing import T_num, T_obj_list @@ -181,7 +181,9 @@ def structure_tree(self) -> List[Dict[str, Any]]: if self.pages_to_parse is None: numbered_pages = None else: - numbered_pages = zip(self.pages_to_parse, (p.page_obj for p in self.pages)) + numbered_pages = zip( + self.pages_to_parse, (p.page_obj for p in self.pages) + ) return [elem.to_dict() for elem in PDFStructTree(self.doc, numbered_pages)] except PDFNoStructTree: return [] diff --git a/pdfplumber/utils/pdfinternals.py b/pdfplumber/utils/pdfinternals.py index f2460c9b..50413f10 100644 --- a/pdfplumber/utils/pdfinternals.py +++ b/pdfplumber/utils/pdfinternals.py @@ -2,7 +2,7 @@ from playa.pdftypes import PDFObjRef from playa.psparser import PSLiteral -from playa.utils import PDFDocEncoding, decode_text +from playa.utils import decode_text def resolve_and_decode(obj: Any) -> Any: From 5991da36dcb27149e210094f3b8b9bf2402f3393 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 1 Oct 2024 18:48:59 -0400 Subject: [PATCH 06/34] fix(deps): switch to unreleased playa --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7f2653dd..33edfdad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -pdfminer.six==20231228 +playa @ git+https://github.com/dhdaines/playa.git Pillow>=9.1 pypdfium2>=4.18.0 From f5ac9f89fb3342af7f9f59e04dc84a5ea3293cc2 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 22 Oct 2024 09:24:39 -0400 Subject: [PATCH 07/34] feat: playa exposes these now (but... for how long) --- pdfplumber/page.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 84cbae09..f801076a 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -351,17 +351,6 @@ def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]: ) if isinstance(obj, LTChar): - # pdfminer.six (at least as of v20221105) does not - # directly expose .stroking_color and .non_stroking_color - # for LTChar objects (unlike, e.g., LTRect objects). - gs = obj.graphicstate - attr["stroking_color"], attr["stroking_pattern"] = normalize_color( - gs.scolor - ) - attr["non_stroking_color"], attr["non_stroking_pattern"] = normalize_color( - gs.ncolor - ) - # Handle (rare) byte-encoded fontnames if isinstance(attr["fontname"], bytes): attr["fontname"] = fix_fontname_bytes(attr["fontname"]) From b05d6e267aec088dde37d454997026ce68b328be Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 22 Oct 2024 09:24:48 -0400 Subject: [PATCH 08/34] fix: new API --- pdfplumber/pdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index bd8e9cea..bc131c31 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -144,7 +144,7 @@ def pages(self) -> List[Page]: doctop: T_num = 0 pp = self.pages_to_parse self._pages: List[Page] = [] - for i, page in enumerate(self.doc.get_pages()): + for i, page in enumerate(self.doc.pages): page_number = i + 1 if pp is not None and page_number not in pp: continue From 36e28cb777bca3b7762388f50b51c863c8ed8225 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 23 Oct 2024 11:14:26 -0400 Subject: [PATCH 09/34] feat!: remove custom LAParams (just use pdfminer if you want them) --- pdfplumber/page.py | 9 +------- pdfplumber/pdf.py | 2 -- tests/test_basics.py | 7 ------ tests/test_laparams.py | 51 ------------------------------------------ 4 files changed, 1 insertion(+), 68 deletions(-) delete mode 100644 tests/test_laparams.py diff --git a/pdfplumber/page.py b/pdfplumber/page.py index f801076a..e047e78d 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -24,7 +24,6 @@ LTCurve, LTItem, LTPage, - LTTextContainer, ) from playa.pdfinterp import PDFPageInterpreter from playa.pdfpage import PDFPage @@ -226,7 +225,6 @@ def layout(self) -> LTPage: device = PDFPageAggregator( self.pdf.rsrcmgr, pageno=self.page_number, - laparams=self.pdf.laparams, ) interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device) interpreter.process_page(self.page_obj) @@ -342,15 +340,13 @@ def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]: if color_attr in attr: attr[color_attr], attr[pattern_attr] = normalize_color(attr[color_attr]) - if isinstance(obj, (LTChar, LTTextContainer)): + if isinstance(obj, LTChar): text = obj.get_text() attr["text"] = ( normalize_unicode(self.pdf.unicode_norm, text) if self.pdf.unicode_norm is not None else text ) - - if isinstance(obj, LTChar): # Handle (rare) byte-encoded fontnames if isinstance(attr["fontname"], bytes): attr["fontname"] = fix_fontname_bytes(attr["fontname"]) @@ -386,9 +382,6 @@ def iter_layout_objects( for obj in layout_objects: # If object is, like LTFigure, a higher-level object ... if isinstance(obj, LTContainer): - # and LAParams is passed, process the object itself. - if self.pdf.laparams is not None: - yield self.process_object(obj) # Regardless, iterate through its children yield from self.iter_layout_objects(obj._objs) else: diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index bc131c31..6438e6dd 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -6,7 +6,6 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union from playa.exceptions import PDFNoStructTree, PSException -from playa.layout import LAParams from playa.pdfdocument import PDFDocument from playa.pdfinterp import PDFResourceManager from playa.pdfstructtree import PDFStructTree @@ -39,7 +38,6 @@ def __init__( self.stream_is_external = stream_is_external self.path = path self.pages_to_parse = pages - self.laparams = None if laparams is None else LAParams(**laparams) self.password = password self.unicode_norm = unicode_norm self.raise_unicode_errors = raise_unicode_errors diff --git a/tests/test_basics.py b/tests/test_basics.py index 98932280..8af8b690 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -213,13 +213,6 @@ def test_text_colors(self): char = self.pdf.pages[0].chars[3358] assert char["non_stroking_color"] == (1, 0, 0) - def test_load_with_custom_laparams(self): - # See https://github.com/jsvine/pdfplumber/issues/168 - path = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf") - laparams = dict(line_margin=0.2) - with pdfplumber.open(path, laparams=laparams) as pdf: - assert round(pdf.pages[0].chars[0]["top"], 3) == 66.384 - def test_loading_pathobj(self): from pathlib import Path diff --git a/tests/test_laparams.py b/tests/test_laparams.py deleted file mode 100644 index 1f11a31e..00000000 --- a/tests/test_laparams.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python -import logging -import os -import unittest - -import pdfplumber - -logging.disable(logging.ERROR) - -HERE = os.path.abspath(os.path.dirname(__file__)) - - -class Test(unittest.TestCase): - @classmethod - def setup_class(self): - self.path = os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf") - - def test_without_laparams(self): - with pdfplumber.open(self.path, laparams=None) as pdf: - objs = pdf.pages[0].objects - assert "textboxhorizontal" not in objs.keys() - assert len(objs["char"]) == 4408 - - def test_with_laparams(self): - with pdfplumber.open(self.path, laparams={}) as pdf: - page = pdf.pages[0] - assert len(page.textboxhorizontals) == 27 - assert len(page.textlinehorizontals) == 79 - assert "text" in page.textboxhorizontals[0] - assert "text" in page.textlinehorizontals[0] - assert len(page.chars) == 4408 - assert "anno" not in page.objects.keys() - - def test_vertical_texts(self): - path = os.path.join(HERE, "pdfs/issue-192-example.pdf") - laparams = {"detect_vertical": True} - with pdfplumber.open(path, laparams=laparams) as pdf: - page = pdf.pages[0] - assert len(page.textlinehorizontals) == 142 - assert len(page.textboxhorizontals) == 74 - assert len(page.textlineverticals) == 11 - assert len(page.textboxverticals) == 6 - assert "text" in page.textboxverticals[0] - assert "text" in page.textlineverticals[0] - - def test_issue_383(self): - with pdfplumber.open(self.path, laparams={}) as pdf: - p0 = pdf.pages[0] - assert "anno" not in p0.objects.keys() - cropped = p0.crop((0, 0, 100, 100)) - assert len(cropped.objects) From d6b51065fe98236e244bfea8bc9eb44712b7579d Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 23 Oct 2024 12:52:05 -0400 Subject: [PATCH 10/34] refactor!: another useless pdfminer API removed --- pdfplumber/page.py | 23 +++-------------------- pdfplumber/pdf.py | 2 -- tests/test_basics.py | 4 +++- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index e047e78d..603f6466 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -15,17 +15,8 @@ from unicodedata import normalize as normalize_unicode from warnings import warn -from playa.converter import PDFPageAggregator from playa.exceptions import PDFNoStructTree -from playa.layout import ( - LTChar, - LTComponent, - LTContainer, - LTCurve, - LTItem, - LTPage, -) -from playa.pdfinterp import PDFPageInterpreter +from playa.layout import LTChar, LTComponent, LTContainer, LTCurve, LTItem, LTPage from playa.pdfpage import PDFPage from playa.pdfstructtree import PDFStructTree from playa.psparser import PSLiteral @@ -220,16 +211,8 @@ def structure_tree(self) -> List[Dict[str, Any]]: @property def layout(self) -> LTPage: - if hasattr(self, "_layout"): - return self._layout - device = PDFPageAggregator( - self.pdf.rsrcmgr, - pageno=self.page_number, - ) - interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device) - interpreter.process_page(self.page_obj) - self._layout: LTPage = device.get_result() - return self._layout + # PLAYA will cache it for us + return self.page_obj.layout @property def annots(self) -> T_obj_list: diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 6438e6dd..f03236c7 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -7,7 +7,6 @@ from playa.exceptions import PDFNoStructTree, PSException from playa.pdfdocument import PDFDocument -from playa.pdfinterp import PDFResourceManager from playa.pdfstructtree import PDFStructTree from ._typing import T_num, T_obj_list @@ -43,7 +42,6 @@ def __init__( self.raise_unicode_errors = raise_unicode_errors self.doc = PDFDocument(stream, password=password or "") - self.rsrcmgr = PDFResourceManager() self.metadata = {} for info in self.doc.info: diff --git a/tests/test_basics.py b/tests/test_basics.py index 8af8b690..48f2aa03 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -46,7 +46,9 @@ def test_objects(self): # Ensure that caching is working: assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges) assert id(self.pdf_2._curve_edges) == id(self.pdf_2.curve_edges) - assert id(self.pdf.pages[0]._layout) == id(self.pdf.pages[0].layout) + assert id(self.pdf.pages[0].page_obj._layout) == id( + self.pdf.pages[0].page_obj.layout + ) def test_annots(self): pdf = self.pdf_2 From c0f50c2493206b738a221ea6236c83af9c8d31e2 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 23 Oct 2024 12:52:18 -0400 Subject: [PATCH 11/34] fix: numbertree is just iterable --- pdfplumber/structure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py index 45f25daf..75c4a60f 100644 --- a/pdfplumber/structure.py +++ b/pdfplumber/structure.py @@ -202,7 +202,7 @@ def __init__(self, doc: "PDF", page: Optional["Page"] = None): parent_id = self.page.page_obj.attrs["StructParents"] # NumberTree should have a `get` method like it does in pdf.js... parent_array = resolve1( - next(array for num, array in parent_tree.values if num == parent_id) + next(array for num, array in parent_tree if num == parent_id) ) self._parse_parent_tree(parent_array) else: From a2aeeb3cf917a798ee80f26a283dca74f645c2fc Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 23 Oct 2024 12:55:42 -0400 Subject: [PATCH 12/34] refactor!: remove structure as it is in playa --- pdfplumber/structure.py | 511 ---------------------------------------- 1 file changed, 511 deletions(-) delete mode 100644 pdfplumber/structure.py diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py deleted file mode 100644 index 75c4a60f..00000000 --- a/pdfplumber/structure.py +++ /dev/null @@ -1,511 +0,0 @@ -import itertools -import logging -import re -from collections import deque -from dataclasses import asdict, dataclass, field -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Iterable, - Iterator, - List, - Optional, - Pattern, - Tuple, - Union, -) - -from playa.data_structures import NumberTree -from playa.pdfparser import KEYWORD_NULL -from playa.pdftypes import PDFObjRef, resolve1 -from playa.psparser import PSLiteral - -from ._typing import T_bbox, T_obj -from .utils import decode_text, geometry - -logger = logging.getLogger(__name__) - - -if TYPE_CHECKING: # pragma: nocover - from .page import Page - from .pdf import PDF - - -MatchFunc = Callable[["PDFStructElement"], bool] - - -def _find_all( - elements: Iterable["PDFStructElement"], - matcher: Union[str, Pattern[str], MatchFunc], -) -> Iterator["PDFStructElement"]: - """ - Common code for `find_all()` in trees and elements. - """ - - def match_tag(x: "PDFStructElement") -> bool: - """Match an element name.""" - return x.type == matcher - - def match_regex(x: "PDFStructElement") -> bool: - """Match an element name by regular expression.""" - return matcher.match(x.type) # type: ignore - - if isinstance(matcher, str): - match_func = match_tag - elif isinstance(matcher, re.Pattern): - match_func = match_regex - else: - match_func = matcher # type: ignore - d = deque(elements) - while d: - el = d.popleft() - if match_func(el): - yield el - d.extendleft(reversed(el.children)) - - -class Findable: - """find() and find_all() methods that can be inherited to avoid - repeating oneself""" - - children: List["PDFStructElement"] - - def find_all( - self, matcher: Union[str, Pattern[str], MatchFunc] - ) -> Iterator["PDFStructElement"]: - """Iterate depth-first over matching elements in subtree. - - The `matcher` argument is either an element name, a regular - expression, or a function taking a `PDFStructElement` and - returning `True` if the element matches. - """ - return _find_all(self.children, matcher) - - def find( - self, matcher: Union[str, Pattern[str], MatchFunc] - ) -> Optional["PDFStructElement"]: - """Find the first matching element in subtree. - - The `matcher` argument is either an element name, a regular - expression, or a function taking a `PDFStructElement` and - returning `True` if the element matches. - """ - try: - return next(_find_all(self.children, matcher)) - except StopIteration: - return None - - -@dataclass -class PDFStructElement(Findable): - type: str - revision: Optional[int] - id: Optional[str] - lang: Optional[str] - alt_text: Optional[str] - actual_text: Optional[str] - title: Optional[str] - page_number: Optional[int] - attributes: Dict[str, Any] = field(default_factory=dict) - mcids: List[int] = field(default_factory=list) - children: List["PDFStructElement"] = field(default_factory=list) - - def __iter__(self) -> Iterator["PDFStructElement"]: - return iter(self.children) - - def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]: - """Collect all MCIDs (with their page numbers, if there are - multiple pages in the tree) inside a structure element. - """ - # Collect them depth-first to preserve ordering - for mcid in self.mcids: - yield self.page_number, mcid - d = deque(self.children) - while d: - el = d.popleft() - for mcid in el.mcids: - yield el.page_number, mcid - d.extendleft(reversed(el.children)) - - def to_dict(self) -> Dict[str, Any]: - """Return a compacted dict representation.""" - r = asdict(self) - # Prune empty values (does not matter in which order) - d = deque([r]) - while d: - el = d.popleft() - for k in list(el.keys()): - if el[k] is None or el[k] == [] or el[k] == {}: - del el[k] - if "children" in el: - d.extend(el["children"]) - return r - - -class StructTreeMissing(ValueError): - pass - - -class PDFStructTree(Findable): - """Parse the structure tree of a PDF. - - The constructor takes a `pdfplumber.PDF` and optionally a - `pdfplumber.Page`. To avoid creating the entire tree for a large - document it is recommended to provide a page. - - This class creates a representation of the portion of the - structure tree that reaches marked content sections, either for a - single page, or for the whole document. Note that this is slightly - different from the behaviour of other PDF libraries which will - also include structure elements with no content. - - If the PDF has no structure, the constructor will raise - `StructTreeMissing`. - - """ - - page: Optional["Page"] - - def __init__(self, doc: "PDF", page: Optional["Page"] = None): - self.doc = doc.doc - if "StructTreeRoot" not in self.doc.catalog: - raise StructTreeMissing("PDF has no structure") - self.root = resolve1(self.doc.catalog["StructTreeRoot"]) - self.role_map = resolve1(self.root.get("RoleMap", {})) - self.class_map = resolve1(self.root.get("ClassMap", {})) - self.children: List[PDFStructElement] = [] - - # If we have a specific page then we will work backwards from - # its ParentTree - this is because structure elements could - # span multiple pages, and the "Pg" attribute is *optional*, - # so this is the approved way to get a page's structure... - if page is not None: - self.page = page - self.pages = {page.page_number: page} - self.page_dict = None - # ...EXCEPT that the ParentTree is sometimes missing, in which - # case we fall back to the non-approved way. - parent_tree_obj = self.root.get("ParentTree") - if parent_tree_obj is None: - self._parse_struct_tree() - else: - parent_tree = NumberTree(parent_tree_obj) - # If there is no marked content in the structure tree for - # this page (which can happen even when there is a - # structure tree) then there is no `StructParents`. - # Note however that if there are XObjects in a page, - # *they* may have `StructParent` (not `StructParents`) - if "StructParents" not in self.page.page_obj.attrs: - return - parent_id = self.page.page_obj.attrs["StructParents"] - # NumberTree should have a `get` method like it does in pdf.js... - parent_array = resolve1( - next(array for num, array in parent_tree if num == parent_id) - ) - self._parse_parent_tree(parent_array) - else: - self.page = None - # Overhead of creating pages shouldn't be too bad we hope! - self.pages = {page.page_number: page for page in doc.pages} - self.page_dict = { - page.page_obj.pageid: page.page_number for page in self.pages.values() - } - self._parse_struct_tree() - - def _make_attributes( - self, obj: Dict[str, Any], revision: Optional[int] - ) -> Dict[str, Any]: - attr_obj_list = [] - for key in "C", "A": - if key not in obj: - continue - attr_obj = resolve1(obj[key]) - # It could be a list of attribute objects (why?) - if isinstance(attr_obj, list): - attr_obj_list.extend(attr_obj) - else: - attr_obj_list.append(attr_obj) - attr_objs = [] - prev_obj = None - for aref in attr_obj_list: - # If we find a revision number, which might "follow the - # revision object" (the spec is not clear about what this - # should look like but it implies they are simply adjacent - # in a flat array), then use it to decide whether to take - # the previous object... - if isinstance(aref, int): - if aref == revision and prev_obj is not None: - attr_objs.append(prev_obj) - prev_obj = None - else: - if prev_obj is not None: - attr_objs.append(prev_obj) - prev_obj = resolve1(aref) - if prev_obj is not None: - attr_objs.append(prev_obj) - # Now merge all the attribute objects in the collected to a - # single set (again, the spec doesn't really explain this but - # does say that attributes in /A supersede those in /C) - attr = {} - for obj in attr_objs: - if isinstance(obj, PSLiteral): - key = decode_text(obj.name) - if key not in self.class_map: - logger.warning("Unknown attribute class %s", key) - continue - obj = self.class_map[key] - for k, v in obj.items(): - if isinstance(v, PSLiteral): - attr[k] = decode_text(v.name) - else: - attr[k] = obj[k] - return attr - - def _make_element(self, obj: Any) -> Tuple[Optional[PDFStructElement], List[Any]]: - # We hopefully caught these earlier - assert "MCID" not in obj, "Uncaught MCR: %s" % obj - assert "Obj" not in obj, "Uncaught OBJR: %s" % obj - # Get page number if necessary - page_number = None - if self.page_dict is not None and "Pg" in obj: - page_objid = obj["Pg"].objid - assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj - page_number = self.page_dict[page_objid] - obj_tag = "" - if "S" in obj: - obj_tag = decode_text(obj["S"].name) - if obj_tag in self.role_map: - obj_tag = decode_text(self.role_map[obj_tag].name) - children = resolve1(obj["K"]) if "K" in obj else [] - if isinstance(children, int): # ugh... isinstance... - children = [children] - elif isinstance(children, dict): # a single object.. ugh... - children = [obj["K"]] - revision = obj.get("R") - attributes = self._make_attributes(obj, revision) - element_id = decode_text(resolve1(obj["ID"])) if "ID" in obj else None - title = decode_text(resolve1(obj["T"])) if "T" in obj else None - lang = decode_text(resolve1(obj["Lang"])) if "Lang" in obj else None - alt_text = decode_text(resolve1(obj["Alt"])) if "Alt" in obj else None - actual_text = ( - decode_text(resolve1(obj["ActualText"])) if "ActualText" in obj else None - ) - element = PDFStructElement( - type=obj_tag, - id=element_id, - page_number=page_number, - revision=revision, - lang=lang, - title=title, - alt_text=alt_text, - actual_text=actual_text, - attributes=attributes, - ) - return element, children - - def _parse_parent_tree(self, parent_array: List[Any]) -> None: - """Populate the structure tree using the leaves of the parent tree for - a given page.""" - # First walk backwards from the leaves to the root, tracking references - d = deque(parent_array) - s = {} - found_root = False - while d: - ref = d.popleft() - # In the case where an MCID is not associated with any - # structure, there will be a "null" in the parent tree. - if ref == KEYWORD_NULL: - continue - if repr(ref) in s: - continue - obj = resolve1(ref) - # This is required! It's in the spec! - if "Type" in obj and decode_text(obj["Type"].name) == "StructTreeRoot": - found_root = True - else: - # We hope that these are actual elements and not - # references or marked-content sections... - element, children = self._make_element(obj) - # We have no page tree so we assume this page was parsed - assert element is not None - s[repr(ref)] = element, children - d.append(obj["P"]) - # If we didn't reach the root something is quite wrong! - assert found_root - self._resolve_children(s) - - def on_parsed_page(self, obj: Dict[str, Any]) -> bool: - if "Pg" not in obj: - return True - page_objid = obj["Pg"].objid - if self.page_dict is not None: - return page_objid in self.page_dict - if self.page is not None: - # We have to do this to satisfy mypy - if page_objid != self.page.page_obj.pageid: - return False - return True - - def _parse_struct_tree(self) -> None: - """Populate the structure tree starting from the root, skipping - unparsed pages and empty elements.""" - root = resolve1(self.root["K"]) - - # It could just be a single object ... it's in the spec (argh) - if isinstance(root, dict): - root = [self.root["K"]] - d = deque(root) - s = {} - while d: - ref = d.popleft() - # In case the tree is actually a DAG and not a tree... - if repr(ref) in s: # pragma: nocover (shouldn't happen) - continue - obj = resolve1(ref) - # Deref top-level OBJR skipping refs to unparsed pages - if isinstance(obj, dict) and "Obj" in obj: - if not self.on_parsed_page(obj): - continue - ref = obj["Obj"] - obj = resolve1(ref) - element, children = self._make_element(obj) - # Similar to above, delay resolving the children to avoid - # tree-recursion. - s[repr(ref)] = element, children - for child in children: - obj = resolve1(child) - if isinstance(obj, dict): - if not self.on_parsed_page(obj): - continue - if "Obj" in obj: - child = obj["Obj"] - elif "MCID" in obj: - continue - if isinstance(child, PDFObjRef): - d.append(child) - - # Traverse depth-first, removing empty elements (unsure how to - # do this non-recursively) - def prune(elements: List[Any]) -> List[Any]: - next_elements = [] - for ref in elements: - obj = resolve1(ref) - if isinstance(ref, int): - next_elements.append(ref) - continue - elif isinstance(obj, dict): - if not self.on_parsed_page(obj): - continue - if "MCID" in obj: - next_elements.append(obj["MCID"]) - continue - elif "Obj" in obj: - ref = obj["Obj"] - element, children = s[repr(ref)] - children = prune(children) - # See assertions below - if element is None or not children: - del s[repr(ref)] - else: - s[repr(ref)] = element, children - next_elements.append(ref) - return next_elements - - prune(root) - self._resolve_children(s) - - def _resolve_children(self, seen: Dict[str, Any]) -> None: - """Resolve children starting from the tree root based on references we - saw when traversing the structure tree. - """ - root = resolve1(self.root["K"]) - # It could just be a single object ... it's in the spec (argh) - if isinstance(root, dict): - root = [self.root["K"]] - self.children = [] - # Create top-level self.children - parsed_root = [] - for ref in root: - obj = resolve1(ref) - if isinstance(obj, dict) and "Obj" in obj: - if not self.on_parsed_page(obj): - continue - ref = obj["Obj"] - if repr(ref) in seen: - parsed_root.append(ref) - d = deque(parsed_root) - while d: - ref = d.popleft() - element, children = seen[repr(ref)] - assert element is not None, "Unparsed element" - for child in children: - obj = resolve1(child) - if isinstance(obj, int): - element.mcids.append(obj) - elif isinstance(obj, dict): - # Skip out-of-page MCIDS and OBJRs - if not self.on_parsed_page(obj): - continue - if "MCID" in obj: - element.mcids.append(obj["MCID"]) - elif "Obj" in obj: - child = obj["Obj"] - # NOTE: if, not elif, in case of OBJR above - if isinstance(child, PDFObjRef): - child_element, _ = seen.get(repr(child), (None, None)) - if child_element is not None: - element.children.append(child_element) - d.append(child) - self.children = [seen[repr(ref)][0] for ref in parsed_root] - - def __iter__(self) -> Iterator[PDFStructElement]: - return iter(self.children) - - def element_bbox(self, el: PDFStructElement) -> T_bbox: - """Get the bounding box for an element for visual debugging.""" - page = None - if self.page is not None: - page = self.page - elif el.page_number is not None: - page = self.pages[el.page_number] - bbox = el.attributes.get("BBox", None) - if page is not None and bbox is not None: - from .page import CroppedPage, _invert_box, _normalize_box - - # Use secret knowledge of CroppedPage (cannot use - # page.height because it is the *cropped* dimension, but - # cropping does not actually translate coordinates) - bbox = _invert_box( - _normalize_box(bbox), page.mediabox[3] - page.mediabox[1] - ) - # Use more secret knowledge of CroppedPage - if isinstance(page, CroppedPage): - rect = geometry.bbox_to_rect(bbox) - rects = page._crop_fn([rect]) - if not rects: - raise IndexError("Element no longer on page") - return geometry.obj_to_bbox(rects[0]) - else: - # Not sure why mypy complains here - return bbox # type: ignore - else: - mcid_objs = [] - for page_number, mcid in el.all_mcids(): - objects: Iterable[T_obj] - if page_number is None: - if page is not None: - objects = itertools.chain.from_iterable(page.objects.values()) - else: - objects = [] # pragma: nocover - else: - objects = itertools.chain.from_iterable( - self.pages[page_number].objects.values() - ) - for c in objects: - if c["mcid"] == mcid: - mcid_objs.append(c) - if not mcid_objs: - raise IndexError("No objects found") # pragma: nocover - return geometry.objects_to_bbox(mcid_objs) From ac185d648b26662fbbb852356022d58cf1ac7bbb Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 31 Oct 2024 16:07:26 -0400 Subject: [PATCH 13/34] fix: minimally support (not quite working) new PLAYA API --- pdfplumber/convert.py | 2 +- pdfplumber/page.py | 27 +++++++++++++++------------ pdfplumber/pdf.py | 8 +++----- pdfplumber/utils/pdfinternals.py | 4 ++-- tests/test_utils.py | 4 ++-- 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index fbaf8ce8..70097dee 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -1,7 +1,7 @@ import base64 from typing import Any, Callable, Dict, List, Optional, Tuple -from playa.psparser import PSLiteral +from playa.parser import PSLiteral from .utils import decode_text diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 603f6466..a7577540 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -16,10 +16,10 @@ from warnings import warn from playa.exceptions import PDFNoStructTree -from playa.layout import LTChar, LTComponent, LTContainer, LTCurve, LTItem, LTPage -from playa.pdfpage import PDFPage -from playa.pdfstructtree import PDFStructTree -from playa.psparser import PSLiteral +from playa.layout import LTChar, LTComponent, LTCurve, LTFigure +from playa.page import Page as PDFPage +from playa.parser import PSLiteral +from playa.structtree import PDFStructTree from . import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list @@ -201,18 +201,20 @@ def height(self) -> T_num: @property def structure_tree(self) -> List[Dict[str, Any]]: """Return the structure tree for a page, if any.""" + try: return [ - elem.to_dict() - for elem in PDFStructTree(self.pdf.doc, [(None, self.page_obj)]) + elem.to_dict() for elem in PDFStructTree(self.pdf.doc, [self.page_obj]) ] except PDFNoStructTree: return [] @property - def layout(self) -> LTPage: - # PLAYA will cache it for us - return self.page_obj.layout + def layout(self) -> List[LTComponent]: + if hasattr(self, "_layout"): + return self._layout + self._layout = list(self.page_obj.layout) + return self._layout @property def annots(self) -> T_obj_list: @@ -295,7 +297,7 @@ def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]: # See note below re. #1181 and mediabox-adjustment reversions return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1]) - def process_object(self, obj: LTItem) -> T_obj: + def process_object(self, obj: LTComponent) -> T_obj: kind = re.sub(lt_pat, "", obj.__class__.__name__).lower() def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]: @@ -330,6 +332,7 @@ def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]: if self.pdf.unicode_norm is not None else text ) + # Handle (rare) byte-encoded fontnames if isinstance(attr["fontname"], bytes): attr["fontname"] = fix_fontname_bytes(attr["fontname"]) @@ -364,7 +367,7 @@ def iter_layout_objects( ) -> Generator[T_obj, None, None]: for obj in layout_objects: # If object is, like LTFigure, a higher-level object ... - if isinstance(obj, LTContainer): + if isinstance(obj, LTFigure): # Regardless, iterate through its children yield from self.iter_layout_objects(obj._objs) else: @@ -372,7 +375,7 @@ def iter_layout_objects( def parse_objects(self) -> Dict[str, T_obj_list]: objects: Dict[str, T_obj_list] = {} - for obj in self.iter_layout_objects(self.layout._objs): + for obj in self.iter_layout_objects(self.layout): kind = obj["object_type"] if kind in ["anno"]: continue diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index f03236c7..5cd8508c 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -5,9 +5,9 @@ from types import TracebackType from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union +from playa.document import PDFDocument from playa.exceptions import PDFNoStructTree, PSException -from playa.pdfdocument import PDFDocument -from playa.pdfstructtree import PDFStructTree +from playa.structtree import PDFStructTree from ._typing import T_num, T_obj_list from .container import Container @@ -177,9 +177,7 @@ def structure_tree(self) -> List[Dict[str, Any]]: if self.pages_to_parse is None: numbered_pages = None else: - numbered_pages = zip( - self.pages_to_parse, (p.page_obj for p in self.pages) - ) + numbered_pages = (p.page_obj for p in self.pages) return [elem.to_dict() for elem in PDFStructTree(self.doc, numbered_pages)] except PDFNoStructTree: return [] diff --git a/pdfplumber/utils/pdfinternals.py b/pdfplumber/utils/pdfinternals.py index 50413f10..a81b6fd1 100644 --- a/pdfplumber/utils/pdfinternals.py +++ b/pdfplumber/utils/pdfinternals.py @@ -1,7 +1,7 @@ from typing import Any, List, Optional, Union -from playa.pdftypes import PDFObjRef -from playa.psparser import PSLiteral +from playa.parser import PSLiteral +from playa.pdftypes import ObjRef as PDFObjRef from playa.utils import decode_text diff --git a/tests/test_utils.py b/tests/test_utils.py index 80402e66..c0db57f7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,8 +8,8 @@ import pandas as pd import pytest -from playa.pdfparser import PDFObjRef -from playa.psparser import PSLiteral +from playa.parser import PSLiteral +from playa.pdftypes import ObjRef as PDFObjRef import pdfplumber from pdfplumber import utils From 8d70e026686f29ba841c1aa00f49d8c3951b2acb Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 16 Nov 2024 01:03:23 -0500 Subject: [PATCH 14/34] fix: some updates for latest playa --- pdfplumber/page.py | 113 ++++++++++++------------------- pdfplumber/pdf.py | 5 +- pdfplumber/utils/pdfinternals.py | 7 +- tests/test_basics.py | 10 +-- 4 files changed, 55 insertions(+), 80 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index a7577540..72cd2d83 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -1,11 +1,9 @@ -import re from functools import lru_cache from typing import ( TYPE_CHECKING, Any, Callable, Dict, - Generator, List, Optional, Pattern, @@ -15,8 +13,7 @@ from unicodedata import normalize as normalize_unicode from warnings import warn -from playa.exceptions import PDFNoStructTree -from playa.layout import LTChar, LTComponent, LTCurve, LTFigure +from playa.page import LayoutObject from playa.page import Page as PDFPage from playa.parser import PSLiteral from playa.structtree import PDFStructTree @@ -28,8 +25,6 @@ from .utils import decode_text, resolve_all, resolve_and_decode from .utils.text import TextMap -lt_pat = re.compile(r"^LT") - ALL_ATTRS = set( [ "adv", @@ -206,16 +201,9 @@ def structure_tree(self) -> List[Dict[str, Any]]: return [ elem.to_dict() for elem in PDFStructTree(self.pdf.doc, [self.page_obj]) ] - except PDFNoStructTree: + except KeyError: return [] - @property - def layout(self) -> List[LTComponent]: - if hasattr(self, "_layout"): - return self._layout - self._layout = list(self.page_obj.layout) - return self._layout - @property def annots(self) -> T_obj_list: def rotate_point(pt: Tuple[float, float], r: int) -> Tuple[float, float]: @@ -297,88 +285,73 @@ def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]: # See note below re. #1181 and mediabox-adjustment reversions return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1]) - def process_object(self, obj: LTComponent) -> T_obj: - kind = re.sub(lt_pat, "", obj.__class__.__name__).lower() - - def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]: - k, v = item + def process_object(self, layout_object: LayoutObject) -> T_obj: + kind = layout_object["object_type"] + obj: Dict[str, Any] = {"object_type": kind, "page_number": self.page_number} + for k, v in layout_object.items(): if k in ALL_ATTRS: res = resolve_all(v) - return (k, res) - else: - return None - - attr = dict(filter(None, map(process_attr, obj.__dict__.items()))) - - attr["object_type"] = kind - attr["page_number"] = self.page_number + if res is not None: + obj[k] = v - for cs in ["ncs", "scs"]: - if hasattr(obj, cs): - csobj = getattr(obj, cs) - attr[cs] = resolve_and_decode(csobj.name) + csobj = layout_object.get("ncs") + if csobj is not None: + obj["ncs"] = resolve_and_decode(csobj.name) + csobj = layout_object.get("scs") + if csobj is not None: + obj["scs"] = resolve_and_decode(csobj.name) for color_attr, pattern_attr in [ ("stroking_color", "stroking_pattern"), ("non_stroking_color", "non_stroking_pattern"), ]: - if color_attr in attr: - attr[color_attr], attr[pattern_attr] = normalize_color(attr[color_attr]) + if color_attr in obj: + obj[color_attr], obj[pattern_attr] = normalize_color(obj[color_attr]) - if isinstance(obj, LTChar): - text = obj.get_text() - attr["text"] = ( + if kind == "char": + text = layout_object["text"] + obj["text"] = ( normalize_unicode(self.pdf.unicode_norm, text) if self.pdf.unicode_norm is not None else text ) - # Handle (rare) byte-encoded fontnames - if isinstance(attr["fontname"], bytes): - attr["fontname"] = fix_fontname_bytes(attr["fontname"]) - - elif isinstance(obj, (LTCurve,)): - attr["pts"] = list(map(self.point2coord, attr["pts"])) - - # Ignoring typing because type signature for obj.original_path - # appears to be incorrect - attr["path"] = [(cmd, *map(self.point2coord, pts)) for cmd, *pts in obj.original_path] # type: ignore # noqa: E501 - - attr["dash"] = obj.dashing_style + if isinstance(obj["fontname"], bytes): + obj["fontname"] = fix_fontname_bytes(obj["fontname"]) + elif obj["object_type"] == "curve": + obj["pts"] = list(map(self.point2coord, layout_object["pts"])) + obj["path"] = [ + (cmd, *map(self.point2coord, pts)) + for cmd, *pts in layout_object["path"] + ] # noqa: E501 # As noted in #1181, `pdfminer.six` adjusts objects' # coordinates relative to the MediaBox: # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84 mb_x0, mb_top = self.mediabox[:2] - if "y0" in attr: - attr["top"] = (self.height - attr["y1"]) + mb_top - attr["bottom"] = (self.height - attr["y0"]) + mb_top - attr["doctop"] = self.initial_doctop + attr["top"] - - if "x0" in attr and mb_x0 != 0: - attr["x0"] = attr["x0"] + mb_x0 - attr["x1"] = attr["x1"] + mb_x0 + if "y0" in obj: + obj["top"] = (self.height - obj["y1"]) + mb_top + obj["bottom"] = (self.height - obj["y0"]) + mb_top + obj["doctop"] = self.initial_doctop + obj["top"] - return attr + if "x0" in obj and mb_x0 != 0: + obj["x0"] = obj["x0"] + mb_x0 + obj["x1"] = obj["x1"] + mb_x0 + return obj - def iter_layout_objects( - self, layout_objects: List[LTComponent] - ) -> Generator[T_obj, None, None]: - for obj in layout_objects: - # If object is, like LTFigure, a higher-level object ... - if isinstance(obj, LTFigure): - # Regardless, iterate through its children - yield from self.iter_layout_objects(obj._objs) - else: - yield self.process_object(obj) + @property + def layout(self) -> List[LayoutObject]: + if hasattr(self, "_layout"): + return self._layout + self._layout = list(self.page_obj.layout) + return self._layout def parse_objects(self) -> Dict[str, T_obj_list]: objects: Dict[str, T_obj_list] = {} - for obj in self.iter_layout_objects(self.layout): + for layout_obj in self.layout: + obj = self.process_object(layout_obj) kind = obj["object_type"] - if kind in ["anno"]: - continue if objects.get(kind) is None: objects[kind] = [] objects[kind].append(obj) diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 5cd8508c..b8f3ee41 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -6,7 +6,6 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union from playa.document import PDFDocument -from playa.exceptions import PDFNoStructTree, PSException from playa.structtree import PDFStructTree from ._typing import T_num, T_obj_list @@ -107,7 +106,7 @@ def open( raise_unicode_errors=raise_unicode_errors, ) - except PSException: + except Exception: if not stream_is_external: stream.close() raise @@ -179,7 +178,7 @@ def structure_tree(self) -> List[Dict[str, Any]]: else: numbered_pages = (p.page_obj for p in self.pages) return [elem.to_dict() for elem in PDFStructTree(self.doc, numbered_pages)] - except PDFNoStructTree: + except KeyError: return [] def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]: diff --git a/pdfplumber/utils/pdfinternals.py b/pdfplumber/utils/pdfinternals.py index a81b6fd1..5facf676 100644 --- a/pdfplumber/utils/pdfinternals.py +++ b/pdfplumber/utils/pdfinternals.py @@ -59,8 +59,11 @@ def resolve_all(x: Any) -> Any: return x return resolve_all(resolved) - elif isinstance(x, (list, tuple)): - return type(x)(resolve_all(v) for v in x) + # FIXME: This is suboptimal for NamedTuples... + elif isinstance(x, tuple): + return tuple(resolve_all(v) for v in x) + elif isinstance(x, list): + return list(resolve_all(v) for v in x) elif isinstance(x, dict): exceptions = ["Parent"] if get_dict_type(x) == "Annot" else [] return {k: v if k in exceptions else resolve_all(v) for k, v in x.items()} diff --git a/tests/test_basics.py b/tests/test_basics.py index 48f2aa03..68bf95c4 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -46,8 +46,8 @@ def test_objects(self): # Ensure that caching is working: assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges) assert id(self.pdf_2._curve_edges) == id(self.pdf_2.curve_edges) - assert id(self.pdf.pages[0].page_obj._layout) == id( - self.pdf.pages[0].page_obj.layout + assert id(self.pdf.pages[0]._layout) == id( + self.pdf.pages[0].layout ) def test_annots(self): @@ -232,11 +232,11 @@ def test_loading_fileobj(self): def test_bad_fileobj(self): path = os.path.join(HERE, "pdfs/empty.pdf") - with pytest.raises(pdfplumber.pdf.PSException): + with pytest.raises(ValueError): pdfplumber.open(path) - f = open(path) - with pytest.raises(pdfplumber.pdf.PSException): + f = open(path) # not a binary file + with pytest.raises(TypeError): pdfplumber.open(f) # File objects passed to pdfplumber should not be auto-closed assert not f.closed From 46a8ba2bdc882418a23c3bf99330aa4360a7011c Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 16 Nov 2024 09:06:52 -0500 Subject: [PATCH 15/34] fix: serialize namedtuple colors --- pdfplumber/convert.py | 10 ++++++++++ tests/test_convert.py | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index 70097dee..25439a6b 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -2,6 +2,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple from playa.parser import PSLiteral +from playa.color import ColorGray, ColorRGB, ColorCMYK from .utils import decode_text @@ -91,6 +92,15 @@ def serialize(self, obj: Any) -> Any: else: return str(obj) + def do_ColorGray(self, x: ColorGray) -> float: + return x.k + + def do_ColorRGB(self, x: ColorRGB) -> Tuple[Any, ...]: + return tuple(x) + + def do_ColorCMYK(self, x: ColorCMYK) -> Tuple[Any, ...]: + return tuple(x) + def do_float(self, x: float) -> float: return x if self.precision is None else round(x, self.precision) diff --git a/tests/test_convert.py b/tests/test_convert.py index ea9d6164..b1f41611 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -242,7 +242,8 @@ def test_cli_csv(self): ] ) - assert res.decode("utf-8").split("\r\n")[9] == ( + lines = res.decode("utf-8").split("\r\n") + assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' ',,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,,,,Y,,1,' From b37032299026d5767506a0c24f2eb09396ac5591 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 16 Nov 2024 20:55:54 -0500 Subject: [PATCH 16/34] fix: adjust a few things for playa --- pdfplumber/convert.py | 19 +++++++++++++------ pdfplumber/page.py | 12 +++++------- pdfplumber/utils/pdfinternals.py | 3 --- tests/test_convert.py | 31 ++++++++++++++++++++++++------- 4 files changed, 42 insertions(+), 23 deletions(-) diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index 25439a6b..93a6018b 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -3,6 +3,7 @@ from playa.parser import PSLiteral from playa.color import ColorGray, ColorRGB, ColorCMYK +from playa.page import DashingStyle from .utils import decode_text @@ -92,14 +93,20 @@ def serialize(self, obj: Any) -> Any: else: return str(obj) - def do_ColorGray(self, x: ColorGray) -> float: - return x.k + def do_DashingStyle(self, x: DashingStyle) -> str: + if x.dash: + return f"({x.dash}, {x.phase})" + else: + return "" + + def do_ColorGray(self, x: ColorGray) -> str: + return str(x.k) - def do_ColorRGB(self, x: ColorRGB) -> Tuple[Any, ...]: - return tuple(x) + def do_ColorRGB(self, x: ColorRGB) -> str: + return f"({x.r}, {x.g}, {x.b})" - def do_ColorCMYK(self, x: ColorCMYK) -> Tuple[Any, ...]: - return tuple(x) + def do_ColorCMYK(self, x: ColorCMYK) -> str: + return f"({x.c}, {x.m}, {x.y}, {x.k})" def do_float(self, x: float) -> float: return x if self.precision is None else round(x, self.precision) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 72cd2d83..3e1e60f6 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -40,9 +40,11 @@ "y1", "bits", "matrix", + "name", "upright", "fontname", "text", + "dash", "imagemask", "colorspace", "evenodd", @@ -290,16 +292,12 @@ def process_object(self, layout_object: LayoutObject) -> T_obj: obj: Dict[str, Any] = {"object_type": kind, "page_number": self.page_number} for k, v in layout_object.items(): if k in ALL_ATTRS: - res = resolve_all(v) - if res is not None: - obj[k] = v + obj[k] = resolve_all(v) csobj = layout_object.get("ncs") - if csobj is not None: - obj["ncs"] = resolve_and_decode(csobj.name) + obj["ncs"] = None if csobj is None else resolve_and_decode(csobj.name) csobj = layout_object.get("scs") - if csobj is not None: - obj["scs"] = resolve_and_decode(csobj.name) + obj["scs"] = None if csobj is None else resolve_and_decode(csobj.name) for color_attr, pattern_attr in [ ("stroking_color", "stroking_pattern"), diff --git a/pdfplumber/utils/pdfinternals.py b/pdfplumber/utils/pdfinternals.py index 5facf676..a97b714e 100644 --- a/pdfplumber/utils/pdfinternals.py +++ b/pdfplumber/utils/pdfinternals.py @@ -59,9 +59,6 @@ def resolve_all(x: Any) -> Any: return x return resolve_all(resolved) - # FIXME: This is suboptimal for NamedTuples... - elif isinstance(x, tuple): - return tuple(resolve_all(v) for v in x) elif isinstance(x, list): return list(resolve_all(v) for v in x) elif isinstance(x, dict): diff --git a/tests/test_convert.py b/tests/test_convert.py index b1f41611..6810c825 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -178,7 +178,7 @@ def test_csv(self): assert c.split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,,,,Y,,1,' + ',,,DeviceRGB,"(0.0, 0.0, 0.0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' ) io = StringIO() @@ -241,12 +241,19 @@ def test_cli_csv(self): "3", ] ) - lines = res.decode("utf-8").split("\r\n") + assert ( + lines[0] + == "object_type,page_number,x0,x1,y0,y1,doctop,top,bottom,width,height," + "adv,bits,colorspace,contents,dash,evenodd,fill,fontname,imagemask," + "linewidth,matrix,mcid,name,ncs,non_stroking_color,non_stroking_pattern," + "path,pts,scs,size,srcsize,stream,stroke,stroking_color,stroking_pattern," + "tag,text,title,upright,uri" + ) assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,,,,Y,,1,' + ',,,DeviceRGB,"(0.0, 0.0, 0.0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -269,10 +276,18 @@ def test_cli_csv_exclude(self): ] ) - assert res.decode("utf-8").split("\r\n")[9] == ( + lines = res.decode("utf-8").split("\r\n") + assert ( + lines[0] == "object_type,page_number,x0,x1,y0,y1,doctop,top,bottom," + "width,height,adv,bits,colorspace,contents,dash,evenodd,fill," + "fontname,imagemask,linewidth,name,non_stroking_color,path," + "pts,scs,size,srcsize,stream,stroke,stroking_color,tag," + "text,title,upright,uri" + ) + assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," - "18.0,12.996,,,,,,,TimesNewRomanPSMT," - ',,"(0, 0, 0)",,,DeviceGray,18.0,,,,,,Y,,1,' + "18.0,12.996,,,,,,,TimesNewRomanPSMT,," + ',,"(0.0, 0.0, 0.0)",,,DeviceGray,18.0,,,,0,,Y,,1,' ) def test_cli_csv_include(self): @@ -291,7 +306,9 @@ def test_cli_csv_include(self): ] ) - assert res.decode("utf-8").split("\r\n")[9] == ("char,1") + lines = res.decode("utf-8").split("\r\n") + assert lines[0] == "object_type,page_number" + assert lines[9] == ("char,1") def test_page_to_dict(self): x = self.pdf.pages[0].to_dict(object_types=["char"]) From 3188fba327062d6b555520aca3c78bfc07967984 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sat, 16 Nov 2024 23:33:14 -0500 Subject: [PATCH 17/34] fix: add page numbers to structure tests --- tests/test_structure.py | 197 ++++++++++++++++++++++++++++------------ 1 file changed, 140 insertions(+), 57 deletions(-) diff --git a/tests/test_structure.py b/tests/test_structure.py index 0d64467f..784daed4 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -331,7 +331,6 @@ def teardown_class(self): self.pdf.close() def test_structure_tree(self): - assert self.pdf.pages[0].structure_tree == TREE # Add page numbers d = deque(TREE) while d: @@ -339,6 +338,7 @@ def test_structure_tree(self): el["page_number"] = 1 if "children" in el: d.extend(el["children"]) + assert self.pdf.pages[0].structure_tree == TREE assert self.pdf.structure_tree == TREE @@ -514,13 +514,13 @@ def test_structure_tree(self): { "type": "Sect", "children": [ - {"lang": "FR-CA", "type": "P", "mcids": [0]}, - {"lang": "FR-CA", "type": "P", "mcids": [1]}, - {"lang": "FR-CA", "type": "P", "mcids": [2]}, - {"lang": "FR-CA", "type": "P", "mcids": [3]}, - {"lang": "FR-CA", "type": "P", "mcids": [4]}, - {"lang": "FR-CA", "type": "P", "mcids": [5]}, - {"lang": "FR-CA", "type": "P", "mcids": [6]}, + {"type": "P", "lang": "FR-CA", "mcids": [0], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [1], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [2], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [3], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [4], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [5], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [6], "page_number": 2}, { "type": "L", "children": [ @@ -528,23 +528,29 @@ def test_structure_tree(self): "type": "LI", "children": [ { - "lang": "FR-CA", "type": "LBody", + "lang": "FR-CA", "mcids": [9, 11], "children": [ - {"lang": "FR-FR", "type": "Span", "mcids": [10]} + { + "type": "Span", + "lang": "FR-FR", + "mcids": [10], + "page_number": 2, + } ], + "page_number": 2, } ], } ], }, - {"lang": "FR-CA", "type": "P", "mcids": [14]}, - {"lang": "FR-CA", "type": "P", "mcids": [15]}, - {"lang": "FR-CA", "type": "P", "mcids": [16]}, - {"lang": "FR-FR", "type": "P", "mcids": [17]}, - {"lang": "FR-FR", "type": "P", "mcids": [18]}, - {"lang": "FR-FR", "type": "P", "mcids": [19]}, + {"type": "P", "lang": "FR-CA", "mcids": [14], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [15], "page_number": 2}, + {"type": "P", "lang": "FR-CA", "mcids": [16], "page_number": 2}, + {"type": "P", "lang": "FR-FR", "mcids": [17], "page_number": 2}, + {"type": "P", "lang": "FR-FR", "mcids": [18], "page_number": 2}, + {"type": "P", "lang": "FR-FR", "mcids": [19], "page_number": 2}, ], } ] @@ -619,28 +625,56 @@ def test_structure_tree(self): { "type": "H1", "children": [ - {"type": "Span", "mcids": [0]}, - {"type": "Span", "actual_text": " ", "mcids": [1]}, + {"type": "Span", "mcids": [0], "page_number": 1}, + { + "type": "Span", + "actual_text": " ", + "mcids": [1], + "page_number": 1, + }, ], + "page_number": 1, }, - {"type": "P", "mcids": [2]}, + {"type": "P", "mcids": [2], "page_number": 1}, { "type": "L", "attributes": {"O": "List", "ListNumbering": "Disc"}, "children": [ - {"type": "LI", "children": [{"type": "LBody", "mcids": [3]}]}, - {"type": "LI", "children": [{"type": "LBody", "mcids": [4]}]}, - {"type": "LI", "children": [{"type": "LBody", "mcids": [5]}]}, + { + "type": "LI", + "children": [{"type": "LBody", "mcids": [3], "page_number": 1}], + "page_number": 1, + }, + { + "type": "LI", + "children": [{"type": "LBody", "mcids": [4], "page_number": 1}], + "page_number": 1, + }, + { + "type": "LI", + "children": [{"type": "LBody", "mcids": [5], "page_number": 1}], + "page_number": 1, + }, ], + "page_number": 1, }, - {"type": "P", "mcids": [6]}, + {"type": "P", "mcids": [6], "page_number": 1}, { "type": "L", "attributes": {"O": "List", "ListNumbering": "Decimal"}, "children": [ - {"type": "LI", "children": [{"type": "LBody", "mcids": [7]}]}, - {"type": "LI", "children": [{"type": "LBody", "mcids": [8]}]}, + { + "type": "LI", + "children": [{"type": "LBody", "mcids": [7], "page_number": 1}], + "page_number": 1, + }, + { + "type": "LI", + "children": [{"type": "LBody", "mcids": [8], "page_number": 1}], + "page_number": 1, + }, ], + "page_number": 1, }, { "type": "Table", @@ -653,17 +687,39 @@ def test_structure_tree(self): "children": [ { "type": "TH", - "children": [{"type": "P", "mcids": [9, 10]}], + "children": [ + { + "type": "P", + "mcids": [9, 10], + "page_number": 1, + } + ], + "page_number": 1, }, { "type": "TH", - "children": [{"type": "P", "mcids": [11, 12]}], + "children": [ + { + "type": "P", + "mcids": [11, 12], + "page_number": 1, + } + ], + "page_number": 1, }, { "type": "TH", - "children": [{"type": "P", "mcids": [13, 14]}], + "children": [ + { + "type": "P", + "mcids": [13, 14], + "page_number": 1, + } + ], + "page_number": 1, }, ], + "page_number": 1, } ], }, @@ -675,40 +731,85 @@ def test_structure_tree(self): "children": [ { "type": "TD", - "children": [{"type": "P", "mcids": [15, 16]}], + "children": [ + { + "type": "P", + "mcids": [15, 16], + "page_number": 1, + } + ], + "page_number": 1, }, { "type": "TD", - "children": [{"type": "P", "mcids": [17, 18]}], + "children": [ + { + "type": "P", + "mcids": [17, 18], + "page_number": 1, + } + ], + "page_number": 1, }, { "type": "TD", - "children": [{"type": "P", "mcids": [19, 20]}], + "children": [ + { + "type": "P", + "mcids": [19, 20], + "page_number": 1, + } + ], + "page_number": 1, }, ], + "page_number": 1, }, { "type": "TR", "children": [ { "type": "TD", - "children": [{"type": "P", "mcids": [21, 22]}], + "children": [ + { + "type": "P", + "mcids": [21, 22], + "page_number": 1, + } + ], + "page_number": 1, }, { "type": "TD", - "children": [{"type": "P", "mcids": [23, 24]}], + "children": [ + { + "type": "P", + "mcids": [23, 24], + "page_number": 1, + } + ], + "page_number": 1, }, { "type": "TD", - "children": [{"type": "P", "mcids": [25, 26]}], + "children": [ + { + "type": "P", + "mcids": [25, 26], + "page_number": 1, + } + ], + "page_number": 1, }, ], + "page_number": 1, }, ], + "page_number": 1, }, ], }, - {"type": "P", "mcids": [27]}, + {"type": "P", "mcids": [27], "page_number": 1}, ], } ] @@ -830,26 +931,6 @@ def test_structure_tree(self): "mcids": [2], }, ] -HELLO1 = [ - { - "type": "Section", - "page_number": 1, - "children": [ - { - "type": "P", - "page_number": 1, - "attributes": {"O": "Foo", "A1": 1}, - "mcids": [1], - }, - ], - } -] -HELLO1P = [ - { - "type": "Section", - "children": [{"type": "P", "attributes": {"O": "Foo", "A1": 1}, "mcids": [1]}], - } -] class TestUnparsed(unittest.TestCase): @@ -895,7 +976,9 @@ def test_missing_parenttree(self): def test_image_structure(self): path = os.path.join(HERE, "pdfs/image_structure.pdf") - + # Add page numbers + for el in IMAGESTRUCT[0]["children"]: + el["page_number"] = 1 pdf = pdfplumber.open(path) page = pdf.pages[0] assert page.structure_tree == IMAGESTRUCT @@ -958,6 +1041,6 @@ def test_hello_structure(self): path = os.path.join(HERE, "pdfs/hello_structure.pdf") with pdfplumber.open(path) as pdf: assert pdf.structure_tree == HELLO - assert pdf.pages[0].structure_tree == HELLO1P + assert pdf.pages[0].structure_tree == HELLO1 with pdfplumber.open(path, pages=[1]) as pdf: assert pdf.structure_tree == HELLO1 From be99434cc9edcd960066cc5ffb835abaf868ceb7 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 17 Nov 2024 10:09:26 -0500 Subject: [PATCH 18/34] fix: updated playa names --- pdfplumber/page.py | 2 +- pdfplumber/pdf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 3e1e60f6..f467c9b9 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -16,7 +16,7 @@ from playa.page import LayoutObject from playa.page import Page as PDFPage from playa.parser import PSLiteral -from playa.structtree import PDFStructTree +from playa.structtree import StructTree as PDFStructTree from . import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index b8f3ee41..a7cead0d 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union from playa.document import PDFDocument -from playa.structtree import PDFStructTree +from playa.structtree import StructTree as PDFStructTree from ._typing import T_num, T_obj_list from .container import Container From 59c255b18b32528faf0577d739bc176c200043a4 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 19 Nov 2024 21:27:19 -0500 Subject: [PATCH 19/34] fix: update for PLAYA 0.1 --- pdfplumber/convert.py | 4 ++-- pdfplumber/page.py | 5 +++-- pdfplumber/pdf.py | 9 +++++---- pdfplumber/structure.py | 22 ++++++++++++++++++++++ tests/test_structure.py | 16 ++++++++++++++++ 5 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 pdfplumber/structure.py diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index 93a6018b..eb781c62 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -3,7 +3,7 @@ from playa.parser import PSLiteral from playa.color import ColorGray, ColorRGB, ColorCMYK -from playa.page import DashingStyle +from playa.page import DashPattern from .utils import decode_text @@ -93,7 +93,7 @@ def serialize(self, obj: Any) -> Any: else: return str(obj) - def do_DashingStyle(self, x: DashingStyle) -> str: + def do_DashPattern(self, x: DashPattern) -> str: if x.dash: return f"({x.dash}, {x.phase})" else: diff --git a/pdfplumber/page.py b/pdfplumber/page.py index f467c9b9..22073702 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -16,11 +16,12 @@ from playa.page import LayoutObject from playa.page import Page as PDFPage from playa.parser import PSLiteral -from playa.structtree import StructTree as PDFStructTree +from playa.structtree import StructTree from . import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list from .container import Container +from .structure import structure_dict from .table import T_table_settings, Table, TableFinder, TableSettings from .utils import decode_text, resolve_all, resolve_and_decode from .utils.text import TextMap @@ -201,7 +202,7 @@ def structure_tree(self) -> List[Dict[str, Any]]: try: return [ - elem.to_dict() for elem in PDFStructTree(self.pdf.doc, [self.page_obj]) + structure_dict(elem) for elem in StructTree(self.pdf.doc, [self.page_obj]) ] except KeyError: return [] diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index a7cead0d..b3f55208 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -5,13 +5,14 @@ from types import TracebackType from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union -from playa.document import PDFDocument -from playa.structtree import StructTree as PDFStructTree +from playa.document import Document +from playa.structtree import StructTree from ._typing import T_num, T_obj_list from .container import Container from .page import Page from .repair import T_repair_setting, _repair +from .structure import structure_dict from .utils import resolve_and_decode logger = logging.getLogger(__name__) @@ -40,7 +41,7 @@ def __init__( self.unicode_norm = unicode_norm self.raise_unicode_errors = raise_unicode_errors - self.doc = PDFDocument(stream, password=password or "") + self.doc = Document(stream, password=password or "") self.metadata = {} for info in self.doc.info: @@ -177,7 +178,7 @@ def structure_tree(self) -> List[Dict[str, Any]]: numbered_pages = None else: numbered_pages = (p.page_obj for p in self.pages) - return [elem.to_dict() for elem in PDFStructTree(self.doc, numbered_pages)] + return [structure_dict(elem) for elem in StructTree(self.doc, numbered_pages)] except KeyError: return [] diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py new file mode 100644 index 00000000..9ec744e9 --- /dev/null +++ b/pdfplumber/structure.py @@ -0,0 +1,22 @@ +from playa.structtree import StructElement +from typing import Dict, Any +from collections import deque +from dataclasses import asdict + + +def structure_dict(top: StructElement) -> Dict[str, Any]: + """Return a compacted dict representation of PDF structure.""" + r = asdict(top) + # Prune empty values (does not matter in which order) + d = deque([r]) + while d: + el = d.popleft() + for k in list(el.keys()): + if el[k] is None or el[k] == [] or el[k] == {}: + del el[k] + if "page_idx" in el: + el["page_number"] = el["page_idx"] + 1 + del el["page_idx"] + if "children" in el: + d.extend(el["children"]) + return r diff --git a/tests/test_structure.py b/tests/test_structure.py index 784daed4..d0775f84 100644 --- a/tests/test_structure.py +++ b/tests/test_structure.py @@ -933,6 +933,22 @@ def test_structure_tree(self): ] +HELLO1 = [ + { + "type": "Section", + "page_number": 1, + "children": [ + { + "type": "P", + "page_number": 1, + "attributes": {"O": "Foo", "A1": 1}, + "mcids": [1], + }, + ], + }, +] + + class TestUnparsed(unittest.TestCase): """Test handling of PDFs with unparsed pages.""" From 9449b11246b81f93c0473350194a1b3028785216 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 19 Nov 2024 21:59:51 -0500 Subject: [PATCH 20/34] fix: lint and format and such --- pdfplumber/convert.py | 4 ++-- pdfplumber/page.py | 4 +++- pdfplumber/pdf.py | 4 +++- pdfplumber/structure.py | 5 +++-- tests/test_basics.py | 4 +--- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index eb781c62..e051d2de 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -1,9 +1,9 @@ import base64 from typing import Any, Callable, Dict, List, Optional, Tuple -from playa.parser import PSLiteral -from playa.color import ColorGray, ColorRGB, ColorCMYK +from playa.color import ColorCMYK, ColorGray, ColorRGB from playa.page import DashPattern +from playa.parser import PSLiteral from .utils import decode_text diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 22073702..e3ab10c6 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -142,6 +142,7 @@ class Page(Container): cached_properties: List[str] = Container.cached_properties + ["_layout"] is_original: bool = True pages = None + _layout: List[LayoutObject] def __init__( self, @@ -202,7 +203,8 @@ def structure_tree(self) -> List[Dict[str, Any]]: try: return [ - structure_dict(elem) for elem in StructTree(self.pdf.doc, [self.page_obj]) + structure_dict(elem) + for elem in StructTree(self.pdf.doc, [self.page_obj]) ] except KeyError: return [] diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index b3f55208..6a834b33 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -178,7 +178,9 @@ def structure_tree(self) -> List[Dict[str, Any]]: numbered_pages = None else: numbered_pages = (p.page_obj for p in self.pages) - return [structure_dict(elem) for elem in StructTree(self.doc, numbered_pages)] + return [ + structure_dict(elem) for elem in StructTree(self.doc, numbered_pages) + ] except KeyError: return [] diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py index 9ec744e9..41c046f7 100644 --- a/pdfplumber/structure.py +++ b/pdfplumber/structure.py @@ -1,7 +1,8 @@ -from playa.structtree import StructElement -from typing import Dict, Any from collections import deque from dataclasses import asdict +from typing import Any, Dict + +from playa.structtree import StructElement def structure_dict(top: StructElement) -> Dict[str, Any]: diff --git a/tests/test_basics.py b/tests/test_basics.py index 68bf95c4..8bcf5076 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -46,9 +46,7 @@ def test_objects(self): # Ensure that caching is working: assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges) assert id(self.pdf_2._curve_edges) == id(self.pdf_2.curve_edges) - assert id(self.pdf.pages[0]._layout) == id( - self.pdf.pages[0].layout - ) + assert id(self.pdf.pages[0]._layout) == id(self.pdf.pages[0].layout) def test_annots(self): pdf = self.pdf_2 From 07bfadf1e0cefe2e2c4c9a8563569c84171417eb Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 19 Nov 2024 23:40:38 -0500 Subject: [PATCH 21/34] fix(deps): playa is on pypi now --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 33edfdad..9f2e65d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -playa @ git+https://github.com/dhdaines/playa.git +playa-pdf Pillow>=9.1 pypdfium2>=4.18.0 From 4de84f64e7c6fac492063ee38cc612cb8f0c3a63 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 20 Nov 2024 00:00:51 -0500 Subject: [PATCH 22/34] fix(tests): PLAYA fixed its colors --- requirements.txt | 2 +- tests/test_convert.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9f2e65d2..dac5a3a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -playa-pdf +playa-pdf>=0.1.1 Pillow>=9.1 pypdfium2>=4.18.0 diff --git a/tests/test_convert.py b/tests/test_convert.py index 6810c825..da295843 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -178,7 +178,7 @@ def test_csv(self): assert c.split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0.0, 0.0, 0.0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' + ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' ) io = StringIO() @@ -253,7 +253,7 @@ def test_cli_csv(self): assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0.0, 0.0, 0.0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' + ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -287,7 +287,7 @@ def test_cli_csv_exclude(self): assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,," - ',,"(0.0, 0.0, 0.0)",,,DeviceGray,18.0,,,,0,,Y,,1,' + ',,"(0, 0, 0)",,,DeviceGray,18.0,,,,0,,Y,,1,' ) def test_cli_csv_include(self): From cdd3895697c7508636ddf58cf7df8c9768e86efc Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Wed, 20 Nov 2024 00:21:45 -0500 Subject: [PATCH 23/34] fix(deps): messed up playa again... --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index dac5a3a2..3e613935 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -playa-pdf>=0.1.1 +playa-pdf>=0.1.2 Pillow>=9.1 pypdfium2>=4.18.0 From d27fcb9d64cdaedca94b284abcf31ed601ea47dc Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 22 Nov 2024 15:21:23 -0500 Subject: [PATCH 24/34] fix(tests): back to previous way of formatting colors (for now) --- pdfplumber/convert.py | 16 ---------------- tests/test_convert.py | 6 +++--- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index e051d2de..98c840d3 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -1,7 +1,6 @@ import base64 from typing import Any, Callable, Dict, List, Optional, Tuple -from playa.color import ColorCMYK, ColorGray, ColorRGB from playa.page import DashPattern from playa.parser import PSLiteral @@ -93,21 +92,6 @@ def serialize(self, obj: Any) -> Any: else: return str(obj) - def do_DashPattern(self, x: DashPattern) -> str: - if x.dash: - return f"({x.dash}, {x.phase})" - else: - return "" - - def do_ColorGray(self, x: ColorGray) -> str: - return str(x.k) - - def do_ColorRGB(self, x: ColorRGB) -> str: - return f"({x.r}, {x.g}, {x.b})" - - def do_ColorCMYK(self, x: ColorCMYK) -> str: - return f"({x.c}, {x.m}, {x.y}, {x.k})" - def do_float(self, x: float) -> float: return x if self.precision is None else round(x, self.precision) diff --git a/tests/test_convert.py b/tests/test_convert.py index da295843..fafea2f2 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -178,7 +178,7 @@ def test_csv(self): assert c.split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' + ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) io = StringIO() @@ -253,7 +253,7 @@ def test_cli_csv(self): assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,0,,,Y,,1,' + ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -287,7 +287,7 @@ def test_cli_csv_exclude(self): assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,," - ',,"(0, 0, 0)",,,DeviceGray,18.0,,,,0,,Y,,1,' + ',,"(0, 0, 0)",,,DeviceGray,18.0,,,,"(0,)",,Y,,1,' ) def test_cli_csv_include(self): From 7e7d3542e1c0fe1687d27e06ceb2e19413da2875 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 22 Nov 2024 15:21:38 -0500 Subject: [PATCH 25/34] fix: no longer needs repair as mediabox is normalized --- tests/test_repair.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/test_repair.py b/tests/test_repair.py index dbeb7932..e69c290f 100644 --- a/tests/test_repair.py +++ b/tests/test_repair.py @@ -13,22 +13,12 @@ class Test(unittest.TestCase): def test_from_issue_932(self): + # No longer malformed! path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf") with pdfplumber.open(path) as pdf: page = pdf.pages[0] char = page.chars[0] - assert char["bottom"] > page.height - - with pdfplumber.open(path, repair=True) as pdf: - page = pdf.pages[0] - char = page.chars[0] - assert char["bottom"] < page.height - - with pdfplumber.repair(path) as repaired: - with pdfplumber.open(repaired) as pdf: - page = pdf.pages[0] - char = page.chars[0] - assert char["bottom"] < page.height + assert char["bottom"] <= page.height def test_other_repair_inputs(self): path = os.path.join(HERE, "pdfs/malformed-from-issue-932.pdf") From b3da221cbc1197eefe2bf583716b1192c1b05628 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 22 Nov 2024 15:24:25 -0500 Subject: [PATCH 26/34] fix: remove unused import --- pdfplumber/convert.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py index 98c840d3..70097dee 100644 --- a/pdfplumber/convert.py +++ b/pdfplumber/convert.py @@ -1,7 +1,6 @@ import base64 from typing import Any, Callable, Dict, List, Optional, Tuple -from playa.page import DashPattern from playa.parser import PSLiteral from .utils import decode_text From b0b7e6cb841338a22cd1ea89f1ee4329fe2bcd44 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 12 Dec 2024 16:37:15 -0500 Subject: [PATCH 27/34] feat: mostly implement using lazy api --- pdfplumber/page.py | 219 ++++++++++++++++++++++++++++--------------- pdfplumber/pdf.py | 2 +- requirements.txt | 2 +- tests/test_basics.py | 2 +- 4 files changed, 146 insertions(+), 79 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index e3ab10c6..f4f634e1 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -4,6 +4,8 @@ Any, Callable, Dict, + Iterator, + Iterable, List, Optional, Pattern, @@ -13,8 +15,15 @@ from unicodedata import normalize as normalize_unicode from warnings import warn -from playa.page import LayoutObject -from playa.page import Page as PDFPage +from playa.page import ( + Page as PDFPage, + ContentObject, + GlyphObject, + TextObject, + PathObject, + PathSegment, +) +from playa.utils import mult_matrix, translate_matrix from playa.parser import PSLiteral from playa.structtree import StructTree @@ -87,29 +96,6 @@ def fix_fontname_bytes(fontname: bytes) -> str: return str(prefix)[2:-1] + suffix_new -def separate_pattern( - color: Tuple[Any, ...] -) -> Tuple[Optional[Tuple[Union[float, int], ...]], Optional[str]]: - if isinstance(color[-1], PSLiteral): - return (color[:-1] or None), decode_text(color[-1].name) - else: - return color, None - - -def normalize_color( - color: Any, -) -> Tuple[Optional[Tuple[Union[float, int], ...]], Optional[str]]: - if color is None: - return (None, None) - elif isinstance(color, tuple): - tuplefied = color - elif isinstance(color, list): - tuplefied = tuple(color) - else: - tuplefied = (color,) - return separate_pattern(tuplefied) - - def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]: return { key: (tuple(value) if isinstance(value, list) else value) @@ -122,6 +108,8 @@ def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox: # conventionally specified by their lower-left and upperright # corners, it is acceptable to specify any two diagonally opposite # corners." + # TODO: PLAYA mostly does this for us but need to check if that is + # still the case when rotation is applied x0, x1 = sorted((box_raw[0], box_raw[2])) y0, y1 = sorted((box_raw[1], box_raw[3])) if rotation in [90, 270]: @@ -138,11 +126,61 @@ def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox: return (x0, mb_height - y1, x1, mb_height - y0) +def flatten_contents(objs: Iterable[ContentObject]) -> Iterator[ContentObject]: + """Traverse a PDF page, recursing into text, path, and xobjects.""" + # This was maybe not such a great design decision - all + # ContentObjects are iterable, but only some of them actually + # contain other objects. No way to know this in advance; PLAYA + # should put a __len__ method but also an empty() method on them + count = 0 + for obj in objs: + yield from flatten_contents(obj) + count += 1 + if count == 0: + yield objs + + +def is_closed(segs: List[PathSegment]) -> bool: + """Detect a closed shape.""" + if segs[-1].operator == "h": # Easy, it tells us it's closed! + return True + if segs[-1].points[-1] == segs[0].points[-1]: + return True + return False + + +def is_rectangular(segs: List[PathSegment]) -> bool: + """Detect a rectangular shape. + + TODO: Rectangular here is defined in device space, what about + rotated rectangles? Are they not rectangles too? If you prick + yourself on them, do you not bleed? + """ + # A rectangle has four sides (exception for a redundant 'h') + if len(segs) > 5 or (len(segs) == 5 and segs[4].operator != "h"): + return False + xs = [] + ys = [] + # TODO: This could be done with fancy list comprehensions which + # might be slightly faster but much less easy to understand. + for seg in segs[:4]: + if seg.operator == "h": + x, y = segs[0].points[-1] + else: + x, y = seg.points[-1] + xs.append(x) + ys.append(y) + if xs[0] == xs[1] and ys[1] == ys[2] and xs[2] == xs[3] and ys[3] == ys[0]: + return True + if ys[0] == ys[1] and xs[1] == xs[2] and ys[2] == ys[3] and xs[3] == xs[0]: + return True + return False + + class Page(Container): - cached_properties: List[str] = Container.cached_properties + ["_layout"] + cached_properties: List[str] = Container.cached_properties is_original: bool = True pages = None - _layout: List[LayoutObject] def __init__( self, @@ -290,68 +328,97 @@ def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]: # See note below re. #1181 and mediabox-adjustment reversions return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1]) - def process_object(self, layout_object: LayoutObject) -> T_obj: - kind = layout_object["object_type"] - obj: Dict[str, Any] = {"object_type": kind, "page_number": self.page_number} - for k, v in layout_object.items(): - if k in ALL_ATTRS: - obj[k] = resolve_all(v) - - csobj = layout_object.get("ncs") - obj["ncs"] = None if csobj is None else resolve_and_decode(csobj.name) - csobj = layout_object.get("scs") - obj["scs"] = None if csobj is None else resolve_and_decode(csobj.name) - - for color_attr, pattern_attr in [ - ("stroking_color", "stroking_pattern"), - ("non_stroking_color", "non_stroking_pattern"), - ]: - if color_attr in obj: - obj[color_attr], obj[pattern_attr] = normalize_color(obj[color_attr]) - - if kind == "char": - text = layout_object["text"] + def process_object(self, content_object: ContentObject) -> T_obj: + kind = content_object.object_type + obj: T_obj = {"object_type": kind, "page_number": self.page_number} + + gstate = content_object.gstate + # These cannot be None, but they might be the defaults + obj["ncs"] = resolve_and_decode(gstate.ncs.name) + obj["scs"] = resolve_and_decode(gstate.scs.name) + obj["stroking_color"] = gstate.scolor.values + obj["stroking_pattern"] = gstate.scolor.pattern + obj["non_stroking_color"] = gstate.ncolor.values + obj["non_stroking_pattern"] = gstate.ncolor.pattern + + # As noted in #1181, `pdfminer.six` (and `playa` by default) adjust objects' + # coordinates relative to the MediaBox: + # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84 + mb_x0, mb_top = self.mediabox[:2] + + try: + x0, y0, x1, y1 = content_object.bbox + obj["x0"] = x0 + mb_x0 + obj["x1"] = x1 + mb_x0 + obj["y0"] = y0 # FIXME: but... what about the MediaBox? + obj["y1"] = y1 + obj["top"] = (self.height - y1) + mb_top + obj["bottom"] = (self.height - y0) + mb_top + obj["doctop"] = self.initial_doctop + obj["top"] + obj["height"] = abs(y1 - y0) + obj["width"] = abs(x1 - x0) + except ValueError: + # It's an object with no bbox (e.g. a marked content point) + pass + + if isinstance(content_object, GlyphObject): + text = content_object.text + obj["object_type"] = "char" + obj["cid"] = content_object.cid + obj["adv"] = content_object.adv + textstate = content_object.textstate + obj["fontname"] = textstate.font.fontname obj["text"] = ( normalize_unicode(self.pdf.unicode_norm, text) - if self.pdf.unicode_norm is not None + if text and self.pdf.unicode_norm is not None else text ) + # Lazy API does not do this stuff for you + # NOTE: This is not right at all for rotated text, but we'll live with it + if textstate.font.vertical: + obj["size"] = obj["width"] + else: + obj["size"] = obj["height"] + matrix = mult_matrix(textstate.line_matrix, content_object.ctm) + matrix = translate_matrix(matrix, textstate.glyph_offset) + obj["matrix"] = matrix + (a, b, c, d, e, f) = matrix + scaling = textstate.scaling * 0.01 # FIXME: unnecessary + obj["upright"] = a * d * scaling > 0 and b * c <= 0 # Handle (rare) byte-encoded fontnames if isinstance(obj["fontname"], bytes): obj["fontname"] = fix_fontname_bytes(obj["fontname"]) - elif obj["object_type"] == "curve": - obj["pts"] = list(map(self.point2coord, layout_object["pts"])) + elif isinstance(content_object, PathObject): + segments = list(content_object.segments) + # Have to do "rect" and "line" detection ourselves, PLAYA + # Does Not Do Heuristics + shape = "".join(seg.operator for seg in segments) + if shape in ("mlh", "ml"): + obj["object_type"] = "line" + elif ( + shape in ("mlllh", "mllll") + and is_closed(segments) + and is_rectangular(segments) + ): + obj["object_type"] = "rect" + else: + obj["object_type"] = "curve" + + obj["pts"] = [ + self.point2coord(seg.points[-1]) for seg in segments if seg.points + ] obj["path"] = [ - (cmd, *map(self.point2coord, pts)) - for cmd, *pts in layout_object["path"] - ] # noqa: E501 - - # As noted in #1181, `pdfminer.six` adjusts objects' - # coordinates relative to the MediaBox: - # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84 - mb_x0, mb_top = self.mediabox[:2] - - if "y0" in obj: - obj["top"] = (self.height - obj["y1"]) + mb_top - obj["bottom"] = (self.height - obj["y0"]) + mb_top - obj["doctop"] = self.initial_doctop + obj["top"] + (seg.operator, [self.point2coord(pt) for pt in seg.points]) + for seg in segments + ] - if "x0" in obj and mb_x0 != 0: - obj["x0"] = obj["x0"] + mb_x0 - obj["x1"] = obj["x1"] + mb_x0 return obj - @property - def layout(self) -> List[LayoutObject]: - if hasattr(self, "_layout"): - return self._layout - self._layout = list(self.page_obj.layout) - return self._layout - def parse_objects(self) -> Dict[str, T_obj_list]: objects: Dict[str, T_obj_list] = {} - for layout_obj in self.layout: - obj = self.process_object(layout_obj) + # TODO: flatten_contents should go in PLAYA + for content_object in flatten_contents(self.page_obj): + obj = self.process_object(content_object) kind = obj["object_type"] if objects.get(kind) is None: objects[kind] = [] diff --git a/pdfplumber/pdf.py b/pdfplumber/pdf.py index 6a834b33..f4c01712 100644 --- a/pdfplumber/pdf.py +++ b/pdfplumber/pdf.py @@ -41,7 +41,7 @@ def __init__( self.unicode_norm = unicode_norm self.raise_unicode_errors = raise_unicode_errors - self.doc = Document(stream, password=password or "") + self.doc = Document(stream, password=password or "", space="page") self.metadata = {} for info in self.doc.info: diff --git a/requirements.txt b/requirements.txt index 3e613935..b7464f80 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -playa-pdf>=0.1.2 +playa-pdf[crypto]>=0.2.5 Pillow>=9.1 pypdfium2>=4.18.0 diff --git a/tests/test_basics.py b/tests/test_basics.py index 8bcf5076..edfc569d 100644 --- a/tests/test_basics.py +++ b/tests/test_basics.py @@ -46,7 +46,7 @@ def test_objects(self): # Ensure that caching is working: assert id(self.pdf._rect_edges) == id(self.pdf.rect_edges) assert id(self.pdf_2._curve_edges) == id(self.pdf_2.curve_edges) - assert id(self.pdf.pages[0]._layout) == id(self.pdf.pages[0].layout) + assert id(self.pdf.pages[0]._objects) == id(self.pdf.pages[0].objects) def test_annots(self): pdf = self.pdf_2 From 6de85ede7e86889d34e910894293cc4dbb203aba Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 13 Dec 2024 00:22:43 -0500 Subject: [PATCH 28/34] feat: complete the reimplementation using playa lazy api --- pdfplumber/page.py | 77 ++++++++++++++++++++++++++----------------- tests/test_convert.py | 12 ++++--- 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index f4f634e1..b8be45bb 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -5,7 +5,6 @@ Callable, Dict, Iterator, - Iterable, List, Optional, Pattern, @@ -15,24 +14,18 @@ from unicodedata import normalize as normalize_unicode from warnings import warn -from playa.page import ( - Page as PDFPage, - ContentObject, - GlyphObject, - TextObject, - PathObject, - PathSegment, -) -from playa.utils import mult_matrix, translate_matrix -from playa.parser import PSLiteral +from playa.page import ContentObject, GlyphObject, ImageObject +from playa.page import Page as PDFPage +from playa.page import PathObject, PathSegment from playa.structtree import StructTree +from playa.utils import mult_matrix, translate_matrix from . import utils from ._typing import T_bbox, T_num, T_obj, T_obj_list from .container import Container from .structure import structure_dict from .table import T_table_settings, Table, TableFinder, TableSettings -from .utils import decode_text, resolve_all, resolve_and_decode +from .utils import resolve_all, resolve_and_decode from .utils.text import TextMap ALL_ATTRS = set( @@ -126,18 +119,23 @@ def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox: return (x0, mb_height - y1, x1, mb_height - y0) -def flatten_contents(objs: Iterable[ContentObject]) -> Iterator[ContentObject]: +def flatten_contents(objs: Union[PDFPage, ContentObject]) -> Iterator[ContentObject]: """Traverse a PDF page, recursing into text, path, and xobjects.""" - # This was maybe not such a great design decision - all - # ContentObjects are iterable, but only some of them actually - # contain other objects. No way to know this in advance; PLAYA - # should put a __len__ method but also an empty() method on them - count = 0 - for obj in objs: - yield from flatten_contents(obj) - count += 1 - if count == 0: - yield objs + if isinstance(objs, PathObject): + # PathObjects are a bit special since they always contain at + # least one subpath, and these are a flat list (do not recurse) + yield from objs + else: + # Other ContentObjects are iterable and possibly multiply + # nested (in the case of XObjects), but only some of them + # actually contain other objects. We *could* check the length + # of an object first but this is wasteful, so we don't. + count = 0 + for obj in objs: + yield from flatten_contents(obj) + count += 1 + if count == 0 and not isinstance(objs, PDFPage): + yield objs def is_closed(segs: List[PathSegment]) -> bool: @@ -332,6 +330,13 @@ def process_object(self, content_object: ContentObject) -> T_obj: kind = content_object.object_type obj: T_obj = {"object_type": kind, "page_number": self.page_number} + if content_object.mcs is not None: + obj["mcid"] = content_object.mcs.mcid + obj["tag"] = content_object.mcs.tag + else: + obj["mcid"] = None + obj["tag"] = None + gstate = content_object.gstate # These cannot be None, but they might be the defaults obj["ncs"] = resolve_and_decode(gstate.ncs.name) @@ -340,6 +345,8 @@ def process_object(self, content_object: ContentObject) -> T_obj: obj["stroking_pattern"] = gstate.scolor.pattern obj["non_stroking_color"] = gstate.ncolor.values obj["non_stroking_pattern"] = gstate.ncolor.pattern + obj["dash"] = tuple(gstate.dash) if gstate.dash.dash else None + obj["linewidth"] = gstate.linewidth # As noted in #1181, `pdfminer.six` (and `playa` by default) adjust objects' # coordinates relative to the MediaBox: @@ -364,10 +371,8 @@ def process_object(self, content_object: ContentObject) -> T_obj: if isinstance(content_object, GlyphObject): text = content_object.text obj["object_type"] = "char" - obj["cid"] = content_object.cid obj["adv"] = content_object.adv textstate = content_object.textstate - obj["fontname"] = textstate.font.fontname obj["text"] = ( normalize_unicode(self.pdf.unicode_norm, text) if text and self.pdf.unicode_norm is not None @@ -375,15 +380,17 @@ def process_object(self, content_object: ContentObject) -> T_obj: ) # Lazy API does not do this stuff for you # NOTE: This is not right at all for rotated text, but we'll live with it - if textstate.font.vertical: - obj["size"] = obj["width"] - else: - obj["size"] = obj["height"] + if textstate.font is not None: + obj["fontname"] = textstate.font.fontname + if textstate.font.vertical: + obj["size"] = obj["width"] + else: + obj["size"] = obj["height"] matrix = mult_matrix(textstate.line_matrix, content_object.ctm) matrix = translate_matrix(matrix, textstate.glyph_offset) obj["matrix"] = matrix (a, b, c, d, e, f) = matrix - scaling = textstate.scaling * 0.01 # FIXME: unnecessary + scaling = textstate.scaling * 0.01 # FIXME: unnecessary? obj["upright"] = a * d * scaling > 0 and b * c <= 0 # Handle (rare) byte-encoded fontnames if isinstance(obj["fontname"], bytes): @@ -411,6 +418,16 @@ def process_object(self, content_object: ContentObject) -> T_obj: (seg.operator, [self.point2coord(pt) for pt in seg.points]) for seg in segments ] + obj["evenodd"] = content_object.evenodd + obj["stroke"] = content_object.stroke + obj["fill"] = content_object.fill + elif isinstance(content_object, ImageObject): + obj["colorspace"] = content_object.colorspace + obj["imagemask"] = content_object.imagemask + obj["stream"] = content_object.stream + obj["srcsize"] = content_object.srcsize + obj["bits"] = content_object.bits + obj["name"] = content_object.xobjid return obj diff --git a/tests/test_convert.py b/tests/test_convert.py index fafea2f2..a2c45b93 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -177,8 +177,9 @@ def test_csv(self): c = self.pdf.to_csv(precision=3) assert c.split("\r\n")[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," - '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' + "18.0,12.996,,,,,,,TimesNewRomanPSMT,,0," + '"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"' + ',,,DeviceGray,"(0,)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) io = StringIO() @@ -252,8 +253,9 @@ def test_cli_csv(self): ) assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," - '18.0,12.996,,,,,,,TimesNewRomanPSMT,,,"(1, 0, 0, 1, 45.83, 660.69)"' - ',,,DeviceRGB,"(0, 0, 0)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' + "18.0,12.996,,,,,,,TimesNewRomanPSMT,,0," + '"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"' + ',,,DeviceGray,"(0,)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -287,7 +289,7 @@ def test_cli_csv_exclude(self): assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,," - ',,"(0, 0, 0)",,,DeviceGray,18.0,,,,"(0,)",,Y,,1,' + '0,,"(0,)",,,DeviceGray,18.0,,,,"(0,)",,Y,,1,' ) def test_cli_csv_include(self): From 56bcab6c447f0a5a18c601453bb8d1894f14b094 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 13 Dec 2024 09:05:18 -0500 Subject: [PATCH 29/34] feat: lightly wrap playa structure --- pdfplumber/structure.py | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py index 41c046f7..9612f748 100644 --- a/pdfplumber/structure.py +++ b/pdfplumber/structure.py @@ -1,8 +1,39 @@ from collections import deque -from dataclasses import asdict -from typing import Any, Dict +from dataclasses import asdict, dataclass +from typing import Any, Dict, Iterator, Optional, TYPE_CHECKING -from playa.structtree import StructElement +from playa.structtree import StructElement, StructTree + +if TYPE_CHECKING: # pragma: nocover + from .pdf import PDF + from .page import Page + + +class StructTreeMissing(ValueError): + pass + + +class PDFStructElement(StructElement): + """PDF Logical Structure Element""" + + # When PLAYA changes its API, wrap it here... + + +class PDFStructTree(StructTree): + """PDF Logical Structure Tree""" + + def __init__(self, doc: "PDF", page: Optional["Page"] = None): + self.doc = doc.doc + try: + if page is not None: + pages = [page.page_obj] + else: + pages = None + super().__init__(self.doc, pages) + except KeyError: + raise StructTreeMissing("PDF has no structure") + + # May wish to wrap __iter__, find, find_all in the future def structure_dict(top: StructElement) -> Dict[str, Any]: From 4117380e998995142b363616100cdb6e4a11b9c7 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 13 Dec 2024 09:24:37 -0500 Subject: [PATCH 30/34] fix: lint --- pdfplumber/structure.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdfplumber/structure.py b/pdfplumber/structure.py index 9612f748..c1702e89 100644 --- a/pdfplumber/structure.py +++ b/pdfplumber/structure.py @@ -1,12 +1,12 @@ from collections import deque -from dataclasses import asdict, dataclass -from typing import Any, Dict, Iterator, Optional, TYPE_CHECKING +from dataclasses import asdict +from typing import TYPE_CHECKING, Any, Dict, Optional from playa.structtree import StructElement, StructTree if TYPE_CHECKING: # pragma: nocover - from .pdf import PDF from .page import Page + from .pdf import PDF class StructTreeMissing(ValueError): From 718a558cf0f6e5d6d651d699db47e4710b86923a Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 15 Dec 2024 13:28:21 -0500 Subject: [PATCH 31/34] docs: update README and CHANGELOG --- CHANGELOG.md | 7 +++++++ README.md | 22 ++++++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38f1b12f..82b62d7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). +## Unreleased + +### Changed + +- Switch to using [`PLAYA-PDF`](https://github.com/dhdaines/playa) for PDF parsing for increased speed and robustness. +- Remove pdfminer-specific interfaces (chiefly `LAParams`) + ## [0.11.5] - 2024-10-02 ### Added diff --git a/README.md b/README.md index e8199ea5..c189af04 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Plumb a PDF for detailed information about each text character, rectangle, and line. Plus: Table extraction and visual debugging. -Works best on machine-generated, rather than scanned, PDFs. Built on [`pdfminer.six`](https://github.com/goulu/pdfminer). +Works best on machine-generated, rather than scanned, PDFs. Built on [`PLAYA-PDF`](https://github.com/dhdaines/playa). Currently [tested](tests/) on [Python 3.8, 3.9, 3.10, 3.11](.github/workflows/tests.yml). @@ -50,7 +50,6 @@ The output will be a CSV containing info about every character, line, and rectan |`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.| |`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1, 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.| |`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`, et cetera. Defaults to all available.| -|`--laparams`| A JSON-formatted string (e.g., `'{"detect_vertical": true}'`) to pass to `pdfplumber.open(..., laparams=...)`.| |`--precision [integer]`| The number of decimal places to round floating-point numbers. Defaults to no rounding.| ## Python library @@ -77,8 +76,6 @@ The `open` method returns an instance of the `pdfplumber.PDF` class. To load a password-protected PDF, pass the `password` keyword argument, e.g., `pdfplumber.open("file.pdf", password = "test")`. -To set layout analysis parameters to `pdfminer.six`'s layout engine, pass the `laparams` keyword argument, e.g., `pdfplumber.open("file.pdf", laparams = { "line_overlap": 0.7 })`. - To [pre-normalize Unicode text](https://unicode.org/reports/tr15/), pass `unicode_norm=...`, where `...` is one of the [four Unicode normalization forms](https://unicode.org/reports/tr15/#Normalization_Forms_Table): `"NFC"`, `"NFD"`, `"NFKC"`, or `"NFKD"`. Invalid metadata values are treated as a warning by default. If that is not intended, pass `strict_metadata=True` to the `open` method and `pdfplumber.open` will raise an exception if it is unable to parse the metadata. @@ -132,12 +129,12 @@ Additional methods are described in the sections below: ### Objects -Each instance of `pdfplumber.PDF` and `pdfplumber.Page` provides access to several types of PDF objects, all derived from [`pdfminer.six`](https://github.com/pdfminer/pdfminer.six/) PDF parsing. The following properties each return a Python list of the matching objects: +Each instance of `pdfplumber.PDF` and `pdfplumber.Page` provides access to several types of PDF objects, all derived from [`PLAYA-PDF`](https://github.com/dhdaines/playa/) PDF parsing. The following properties each return a Python list of the matching objects: - `.chars`, each representing a single text character. - `.lines`, each representing a single 1-dimensional line. - `.rects`, each representing a single 2-dimensional rectangle. -- `.curves`, each representing any series of connected points that `pdfminer.six` does not recognize as a line or rectangle. +- `.curves`, each representing any series of connected points that `pdfplumber` does not recognize as a line or rectangle. - `.images`, each representing an image. - `.annots`, each representing a single PDF annotation (cf. Section 8.4 of the [official PDF specification](https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf) for details) - `.hyperlinks`, each representing a single PDF annotation of the subtype `Link` and having an `URI` action attribute @@ -272,18 +269,13 @@ Additionally, both `pdfplumber.PDF` and `pdfplumber.Page` provide access to seve |`srcsize`| The image original dimensions, as a `(width, height)` tuple.| |`colorspace`| Color domain of the image (e.g., RGB).| |`bits`| The number of bits per color component; e.g., 8 corresponds to 255 possible values for each color component (R, G, and B in an RGB color space).| -|`stream`| Pixel values of the image, as a `pdfminer.pdftypes.PDFStream` object.| +|`stream`| Pixel values of the image, as a `playa.pdftypes.ContentStream` object.| |`imagemask`| A nullable boolean; if `True`, "specifies that the image data is to be used as a stencil mask for painting in the current color."| |`name`| "The name by which this image XObject is referenced in the XObject subdictionary of the current resource dictionary." [🔗](https://ghostscript.com/~robin/pdf_reference17.pdf#page=340) | |`mcid`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section ID for this image if any (otherwise `None`). *Experimental attribute.*| |`tag`| The [marked content](https://ghostscript.com/~robin/pdf_reference17.pdf#page=850) section tag for this image if any (otherwise `None`). *Experimental attribute.*| |`object_type`| "image"| -### Obtaining higher-level layout objects via `pdfminer.six` - -If you pass the `pdfminer.six`-handling `laparams` parameter to `pdfplumber.open(...)`, then each page's `.objects` dictionary will also contain `pdfminer.six`'s higher-level layout objects, such as `"textboxhorizontal"`. - - ## Visual debugging `pdfplumber`'s visual debugging tools can be helpful in understanding the structure of a PDF and the objects that have been extracted from it. @@ -451,7 +443,7 @@ Both `vertical_strategy` and `horizontal_strategy` accept the following options: Sometimes PDF files can contain forms that include inputs that people can fill out and save. While values in form fields appear like other text in a PDF file, form data is handled differently. If you want the gory details, see page 671 of this [specification](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.7old.pdf). -`pdfplumber` doesn't have an interface for working with form data, but you can access it using `pdfplumber`'s wrappers around `pdfminer`. +`pdfplumber` doesn't have an interface for working with form data, but you can access it using `pdfplumber`'s wrappers around `PLAYA-PDF`. For example, this snippet will retrieve form field names and values and store them in a dictionary. @@ -523,7 +515,9 @@ It's also helpful to know what features `pdfplumber` does __not__ provide: ### Specific comparisons -- [`pdfminer.six`](https://github.com/pdfminer/pdfminer.six) provides the foundation for `pdfplumber`. It primarily focuses on parsing PDFs, analyzing PDF layouts and object positioning, and extracting text. It does not provide tools for table extraction or visual debugging. +- [`PLAYA-PDF`](https://github.com/dhdaines/playa) provides the foundation for `pdfplumber`. It focuses on parsing PDFs and does not do layout analysis or text extraction. + +- [`pdfminer.six`](https://github.com/pdfminer/pdfminer.six) focuses on parsing PDFs, with some functionality for analyzing PDF layouts and object positioning, and extracting text. It does not provide tools for table extraction or visual debugging. - [`PyPDF2`](https://github.com/mstamy2/PyPDF2) is a pure-Python library "capable of splitting, merging, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files." It can extract page text, but does not provide easy access to shape objects (rectangles, lines, etc.), table-extraction, or visually debugging tools. From 0e6dc30e75ba84232123a87aded50c62a3d7fdbc Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Fri, 27 Dec 2024 12:31:32 -0500 Subject: [PATCH 32/34] feat: expose render_mode (fixes: #1230) --- pdfplumber/page.py | 2 ++ tests/test_convert.py | 11 ++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index b8be45bb..54c45eff 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -47,6 +47,7 @@ "upright", "fontname", "text", + "render_mode", "dash", "imagemask", "colorspace", @@ -378,6 +379,7 @@ def process_object(self, content_object: ContentObject) -> T_obj: if text and self.pdf.unicode_norm is not None else text ) + obj["render_mode"] = textstate.render_mode # Lazy API does not do this stuff for you # NOTE: This is not right at all for rotated text, but we'll live with it if textstate.font is not None: diff --git a/tests/test_convert.py b/tests/test_convert.py index a2c45b93..2e595aca 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -179,7 +179,7 @@ def test_csv(self): "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,,0," '"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"' - ',,,DeviceGray,"(0,)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' + ',,,DeviceGray,"(0,)",,,,0,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) io = StringIO() @@ -248,14 +248,15 @@ def test_cli_csv(self): == "object_type,page_number,x0,x1,y0,y1,doctop,top,bottom,width,height," "adv,bits,colorspace,contents,dash,evenodd,fill,fontname,imagemask," "linewidth,matrix,mcid,name,ncs,non_stroking_color,non_stroking_pattern," - "path,pts,scs,size,srcsize,stream,stroke,stroking_color,stroking_pattern," + "path,pts,render_mode,scs,size,srcsize,stream,stroke,stroking_color," + "stroking_pattern," "tag,text,title,upright,uri" ) assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,,0," '"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"' - ',,,DeviceGray,"(0,)",,,,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' + ',,,DeviceGray,"(0,)",,,,0,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -283,13 +284,13 @@ def test_cli_csv_exclude(self): lines[0] == "object_type,page_number,x0,x1,y0,y1,doctop,top,bottom," "width,height,adv,bits,colorspace,contents,dash,evenodd,fill," "fontname,imagemask,linewidth,name,non_stroking_color,path," - "pts,scs,size,srcsize,stream,stroke,stroking_color,tag," + "pts,render_mode,scs,size,srcsize,stream,stroke,stroking_color,tag," "text,title,upright,uri" ) assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,," - '0,,"(0,)",,,DeviceGray,18.0,,,,"(0,)",,Y,,1,' + '0,,"(0,)",,,0,DeviceGray,18.0,,,,"(0,)",,Y,,1,' ) def test_cli_csv_include(self): From 5caaefb7af55a1bf79b7e8511ac1ebab71ee007e Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 5 Jan 2025 13:28:58 -0500 Subject: [PATCH 33/34] fix: correct the "size" of rotated glyphs --- pdfplumber/page.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 54c45eff..6d3065ed 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -381,13 +381,12 @@ def process_object(self, content_object: ContentObject) -> T_obj: ) obj["render_mode"] = textstate.render_mode # Lazy API does not do this stuff for you - # NOTE: This is not right at all for rotated text, but we'll live with it if textstate.font is not None: obj["fontname"] = textstate.font.fontname if textstate.font.vertical: - obj["size"] = obj["width"] + obj["size"] = textstate.fontsize * content_object.matrix[0] else: - obj["size"] = obj["height"] + obj["size"] = textstate.fontsize * content_object.matrix[3] matrix = mult_matrix(textstate.line_matrix, content_object.ctm) matrix = translate_matrix(matrix, textstate.glyph_offset) obj["matrix"] = matrix From c9d3848d30a5d5e1c5646b75648f880b9f47a281 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 7 Jan 2025 10:36:18 -0500 Subject: [PATCH 34/34] fix(tests): new and more correct text objects --- tests/test_convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_convert.py b/tests/test_convert.py index 2e595aca..37a9cc3e 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -179,7 +179,7 @@ def test_csv(self): "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,,0," '"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"' - ',,,DeviceGray,"(0,)",,,,0,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' + ',,,DeviceRGB,"(0, 0, 0)",,,,0,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) io = StringIO() @@ -256,7 +256,7 @@ def test_cli_csv(self): "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,,0," '"(1.0, 0.0, 0.0, 1.0, 45.83, 660.69)"' - ',,,DeviceGray,"(0,)",,,,0,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' + ',,,DeviceRGB,"(0, 0, 0)",,,,0,DeviceGray,18.0,,,,"(0,)",,,Y,,1,' ) def test_cli_csv_exclude(self): @@ -290,7 +290,7 @@ def test_cli_csv_exclude(self): assert lines[9] == ( "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996," "18.0,12.996,,,,,,,TimesNewRomanPSMT,," - '0,,"(0,)",,,0,DeviceGray,18.0,,,,"(0,)",,Y,,1,' + '0,,"(0, 0, 0)",,,0,DeviceGray,18.0,,,,"(0,)",,Y,,1,' ) def test_cli_csv_include(self):