Skip to content

Commit

Permalink
Add --laparams to CLI (and make related tweaks)
Browse files Browse the repository at this point in the history
This commit adds an `--laparams` flag to the pdfplumber CLI, giving it
more feature parity with the core library. To do so, it makes some
internal changes to `convert.py`, including changing the list of objects
to convert from *a predefined default list* to *all types extracted*.
  • Loading branch information
jsvine authored and Shawn committed Apr 26, 2021
1 parent 0c4c7f2 commit 9927ebc
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 30 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ The output will be a CSV containing info about every character, line, and rectan
|----------|-------------|
|`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.|
|`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
|`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`. Defaults to all.|
|`--types [list of object types to extract]`| Choices include `char`, `rect`, `line`, `curve`, `image`, `annot`, and any other object type found in the PDF. Defaults to all available types.|
|`--laparams`| A JSON-formatted string (e.g., `'{"detect_vertical": true}'`) to pass to `pdfplumber.open(..., laparams=...)`.|

## Python library

Expand Down
14 changes: 9 additions & 5 deletions pdfplumber/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .pdf import PDF
import argparse
from itertools import chain
import json
import sys


Expand All @@ -23,13 +24,16 @@ def parse_args(args_raw):

parser.add_argument("--format", choices=["csv", "json"], default="csv")

parser.add_argument("--types", nargs="+")

parser.add_argument(
"--types",
nargs="+",
default=convert.DEFAULT_TYPES,
choices=convert.DEFAULT_TYPES,
"--all-types",
action="store_true",
help="Return all types of objects. Overrides --types.",
)

parser.add_argument("--laparams", type=json.loads)

parser.add_argument("--pages", nargs="+", type=parse_page_spec)

parser.add_argument(
Expand All @@ -46,7 +50,7 @@ def main(args_raw=sys.argv[1:]):
args = parse_args(args_raw)
converter = {"csv": convert.to_csv, "json": convert.to_json}[args.format]
kwargs = {"csv": {}, "json": {"indent": args.indent}}[args.format]
with PDF.open(args.infile, pages=args.pages) as pdf:
with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
converter(pdf, sys.stdout, args.types, **kwargs)


Expand Down
16 changes: 16 additions & 0 deletions pdfplumber/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,22 @@ def images(self):
def chars(self):
return self.objects.get("char", [])

@property
def textboxverticals(self):
    """Return the objects stored under the ``"textboxvertical"`` key.

    Looks the key up in ``self.objects`` and falls back to an empty list
    when the key is absent.
    """
    found = self.objects.get("textboxvertical", [])
    return found

@property
def textboxhorizontals(self):
    """Return the objects stored under the ``"textboxhorizontal"`` key.

    Looks the key up in ``self.objects`` and falls back to an empty list
    when the key is absent.
    """
    found = self.objects.get("textboxhorizontal", [])
    return found

@property
def textlineverticals(self):
    """Return the objects stored under the ``"textlinevertical"`` key.

    Looks the key up in ``self.objects`` and falls back to an empty list
    when the key is absent.
    """
    found = self.objects.get("textlinevertical", [])
    return found

@property
def textlinehorizontals(self):
    """Return the objects stored under the ``"textlinehorizontal"`` key.

    Looks the key up in ``self.objects`` and falls back to an empty list
    when the key is absent.
    """
    found = self.objects.get("textlinehorizontal", [])
    return found

@property
def rect_edges(self):
if hasattr(self, "_rect_edges"):
Expand Down
36 changes: 17 additions & 19 deletions pdfplumber/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,6 @@
import base64
from io import StringIO

# Fixed list of object-type names referenced by the `types=DEFAULT_TYPES`
# converter signatures and by the CLI's `--types` default/choices.
# NOTE(review): this appears to be the pre-commit side of the diff, i.e. the
# list this commit removes in favor of `types=None` - confirm before reuse.
DEFAULT_TYPES = [
"char",
"rect",
"line",
"curve",
"image",
"annot",
]

COLS_TO_PREPEND = [
"object_type",
"page_number",
Expand Down Expand Up @@ -85,7 +76,10 @@ def serialize(obj):
return str(obj)


def to_json(container, stream=None, types=DEFAULT_TYPES, indent=None):
def to_json(container, stream=None, types=None, indent=None):
if types is None:
types = list(container.objects.keys()) + ["annot"]

def page_to_dict(page):
d = {
"page_number": page.page_number,
Expand Down Expand Up @@ -117,23 +111,27 @@ def page_to_dict(page):
return json.dump(serialized, stream, indent=indent)


def to_csv(container, stream=None, types=DEFAULT_TYPES):
def to_csv(container, stream=None, types=None):
if stream is None:
stream = StringIO()
to_string = True
else:
to_string = False

objs = []
if types is None:
types = list(container.objects.keys()) + ["annot"]

# Determine set of fields for all objects
objs = []
fields = set()
for t in types:
new_objs = getattr(container, t + "s")
if len(new_objs):
objs += new_objs
new_keys = [k for k, v in new_objs[0].items() if type(v) is not dict]
fields = fields.union(set(new_keys))

pages = container.pages if hasattr(container, "pages") else [container]
for page in pages:
for t in types:
new_objs = getattr(page, t + "s")
if len(new_objs):
objs += new_objs
new_keys = [k for k, v in new_objs[0].items() if type(v) is not dict]
fields = fields.union(set(new_keys))

cols = COLS_TO_PREPEND + list(sorted(set(fields) - set(COLS_TO_PREPEND)))

Expand Down
15 changes: 14 additions & 1 deletion tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ def test_json(self):
self.pdf.pages[0].rects[0]["bottom"]
)

def test_json_all_types(self):
    """Check that ``to_json(types=None)`` emits every expected object list.

    The first page of the parsed JSON must contain a key for each of the
    curve, char, line, rect and image collections.
    """
    parsed = json.loads(self.pdf.to_json(types=None))
    first_page_keys = parsed["pages"][0].keys()
    for expected_key in ("curves", "chars", "lines", "rects", "images"):
        assert expected_key in first_page_keys

def test_single_pages(self):
c = json.loads(self.pdf.pages[0].to_json())
assert c["rects"][0]["bottom"] == float(self.pdf.pages[0].rects[0]["bottom"])
Expand All @@ -46,7 +55,7 @@ def test_additional_attr_types(self):

def test_csv(self):
c = self.pdf.to_csv()
assert c.split("\r\n")[1] == (
assert c.split("\r\n")[2] == (
"char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
'18.0,12.996,,,,,,TimesNewRomanPSMT,,,,"(0, 0, 0)",,,18.0,,,,,Y,,1,'
)
Expand All @@ -57,6 +66,10 @@ def test_csv(self):
c_from_io = io.read()
assert c == c_from_io

def test_csv_all_types(self):
    """Check that the first CSV data row of ``to_csv(types=None)`` is a curve."""
    csv_text = self.pdf.to_csv(types=None)
    # Row 0 is the header, so row 1 is the first object written out.
    first_data_row = csv_text.split("\r\n")[1]
    object_type = first_data_row.split(",")[0]
    assert object_type == "curve"

def test_cli(self):
res = run(
[
Expand Down
19 changes: 15 additions & 4 deletions tests/test_laparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,21 @@ def test_without_laparams(self):

def test_with_laparams(self):
with pdfplumber.open(self.path, laparams={}) as pdf:
objs = pdf.pages[0].objects
assert len(objs["textboxhorizontal"]) == 21
assert len(objs["char"]) == 4408
assert "anno" not in objs.keys()
page = pdf.pages[0]
assert len(page.textboxhorizontals) == 21
assert len(page.textlinehorizontals) == 79
assert len(page.chars) == 4408
assert "anno" not in page.objects.keys()

def test_vertical_texts(self):
    """Check layout-object counts when vertical text detection is enabled.

    Opens the issue-192 example PDF with ``detect_vertical`` set and checks
    the number of horizontal and vertical text lines and text boxes on the
    first page.
    """
    pdf_path = os.path.join(HERE, "pdfs/issue-192-example.pdf")
    with pdfplumber.open(pdf_path, laparams={"detect_vertical": True}) as pdf:
        first_page = pdf.pages[0]
        assert len(first_page.textlinehorizontals) == 142
        assert len(first_page.textboxhorizontals) == 74
        assert len(first_page.textlineverticals) == 11
        assert len(first_page.textboxverticals) == 6

def test_issue_383(self):
with pdfplumber.open(self.path, laparams={}) as pdf:
Expand Down

0 comments on commit 9927ebc

Please sign in to comment.