Add --laparams to CLI (and make related tweaks)

This commit adds an `--laparams` flag to the pdfplumber CLI, bringing the CLI closer to feature parity with the core library. To support this, it makes some internal changes to `convert.py`, most notably changing the set of object types that get converted from *a predefined default list* to *all types actually extracted* from the PDF.
jsvine · Apr 26, 2021 · 9927ebc · 9927ebc
1 parent 0c4c7f2
commit 9927ebc
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -46,7 +46,8 @@ The output will be a CSV containing info about every character, line, and rectan
 |----------|-------------|
 |`--format [format]`| `csv` or `json`. The `json` format returns more information; it includes PDF-level and page-level metadata, plus dictionary-nested attributes.|
 |`--pages [list of pages]`| A space-delimited, `1`-indexed list of pages or hyphenated page ranges. E.g., `1 11-15`, which would return data for pages 1, 11, 12, 13, 14, and 15.|
-|`--types [list of object types to extract]`| Choices are `char`, `rect`, `line`, `curve`, `image`, `annot`. Defaults to all.|
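+|`--types [list of object types to extract]`| Common choices are `char`, `rect`, `line`, `curve`, `image`, and `annot`; other object types extracted from the PDF (e.g., `textboxhorizontal` when `--laparams` is passed) are also accepted. Defaults to all object types found in the PDF.|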
+|`--laparams`| A JSON-formatted string (e.g., `'{"detect_vertical": true}'`) to pass to `pdfplumber.open(..., laparams=...)`.|
 
 ## Python library
 

diff --git a/pdfplumber/cli.py b/pdfplumber/cli.py
@@ -3,6 +3,7 @@
 from .pdf import PDF
 import argparse
 from itertools import chain
+import json
 import sys
 
 
@@ -23,13 +24,16 @@ def parse_args(args_raw):
 
     parser.add_argument("--format", choices=["csv", "json"], default="csv")
 
+    parser.add_argument("--types", nargs="+")
+
     parser.add_argument(
-        "--types",
-        nargs="+",
-        default=convert.DEFAULT_TYPES,
-        choices=convert.DEFAULT_TYPES,
+        "--all-types",
+        action="store_true",
+        help="Return all types of objects. Overrides --types.",
     )
 
+    parser.add_argument("--laparams", type=json.loads)
+
     parser.add_argument("--pages", nargs="+", type=parse_page_spec)
 
     parser.add_argument(
@@ -46,7 +50,7 @@ def main(args_raw=sys.argv[1:]):
     args = parse_args(args_raw)
     converter = {"csv": convert.to_csv, "json": convert.to_json}[args.format]
     kwargs = {"csv": {}, "json": {"indent": args.indent}}[args.format]
-    with PDF.open(args.infile, pages=args.pages) as pdf:
+    with PDF.open(args.infile, pages=args.pages, laparams=args.laparams) as pdf:
         converter(pdf, sys.stdout, args.types, **kwargs)
 
 

diff --git a/pdfplumber/container.py b/pdfplumber/container.py
@@ -47,6 +47,22 @@ def images(self):
     def chars(self):
         return self.objects.get("char", [])
 
+    @property
+    def textboxverticals(self):
+        return self.objects.get("textboxvertical", [])
+
+    @property
+    def textboxhorizontals(self):
+        return self.objects.get("textboxhorizontal", [])
+
+    @property
+    def textlineverticals(self):
+        return self.objects.get("textlinevertical", [])
+
+    @property
+    def textlinehorizontals(self):
+        return self.objects.get("textlinehorizontal", [])
+
     @property
     def rect_edges(self):
         if hasattr(self, "_rect_edges"):

diff --git a/pdfplumber/convert.py b/pdfplumber/convert.py
@@ -7,15 +7,6 @@
 import base64
 from io import StringIO
 
-DEFAULT_TYPES = [
-    "char",
-    "rect",
-    "line",
-    "curve",
-    "image",
-    "annot",
-]
-
 COLS_TO_PREPEND = [
     "object_type",
     "page_number",
@@ -85,7 +76,10 @@ def serialize(obj):
         return str(obj)
 
 
-def to_json(container, stream=None, types=DEFAULT_TYPES, indent=None):
+def to_json(container, stream=None, types=None, indent=None):
+    if types is None:
+        types = list(container.objects.keys()) + ["annot"]
+
     def page_to_dict(page):
         d = {
             "page_number": page.page_number,
@@ -117,23 +111,27 @@ def page_to_dict(page):
         return json.dump(serialized, stream, indent=indent)
 
 
-def to_csv(container, stream=None, types=DEFAULT_TYPES):
+def to_csv(container, stream=None, types=None):
     if stream is None:
         stream = StringIO()
         to_string = True
     else:
         to_string = False
 
-    objs = []
+    if types is None:
+        types = list(container.objects.keys()) + ["annot"]
 
-    # Determine set of fields for all objects
+    objs = []
     fields = set()
-    for t in types:
-        new_objs = getattr(container, t + "s")
-        if len(new_objs):
-            objs += new_objs
-            new_keys = [k for k, v in new_objs[0].items() if type(v) is not dict]
-            fields = fields.union(set(new_keys))
+
+    pages = container.pages if hasattr(container, "pages") else [container]
+    for page in pages:
+        for t in types:
+            new_objs = getattr(page, t + "s")
+            if len(new_objs):
+                objs += new_objs
+                new_keys = [k for k, v in new_objs[0].items() if type(v) is not dict]
+                fields = fields.union(set(new_keys))
 
     cols = COLS_TO_PREPEND + list(sorted(set(fields) - set(COLS_TO_PREPEND)))
 

diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -34,6 +34,15 @@ def test_json(self):
             self.pdf.pages[0].rects[0]["bottom"]
         )
 
+    def test_json_all_types(self):
+        c = json.loads(self.pdf.to_json(types=None))
+        found_types = c["pages"][0].keys()
+        assert "curves" in found_types
+        assert "chars" in found_types
+        assert "lines" in found_types
+        assert "rects" in found_types
+        assert "images" in found_types
+
     def test_single_pages(self):
         c = json.loads(self.pdf.pages[0].to_json())
         assert c["rects"][0]["bottom"] == float(self.pdf.pages[0].rects[0]["bottom"])
@@ -46,7 +55,7 @@ def test_additional_attr_types(self):
 
     def test_csv(self):
         c = self.pdf.to_csv()
-        assert c.split("\r\n")[1] == (
+        assert c.split("\r\n")[2] == (
             "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
             '18.0,12.996,,,,,,TimesNewRomanPSMT,,,,"(0, 0, 0)",,,18.0,,,,,Y,,1,'
         )
@@ -57,6 +66,10 @@ def test_csv(self):
         c_from_io = io.read()
         assert c == c_from_io
 
+    def test_csv_all_types(self):
+        c = self.pdf.to_csv(types=None)
+        assert c.split("\r\n")[1].split(",")[0] == "curve"
+
     def test_cli(self):
         res = run(
             [

diff --git a/tests/test_laparams.py b/tests/test_laparams.py
@@ -23,10 +23,21 @@ def test_without_laparams(self):
 
     def test_with_laparams(self):
         with pdfplumber.open(self.path, laparams={}) as pdf:
-            objs = pdf.pages[0].objects
-            assert len(objs["textboxhorizontal"]) == 21
-            assert len(objs["char"]) == 4408
-            assert "anno" not in objs.keys()
+            page = pdf.pages[0]
+            assert len(page.textboxhorizontals) == 21
+            assert len(page.textlinehorizontals) == 79
+            assert len(page.chars) == 4408
+            assert "anno" not in page.objects.keys()
+
+    def test_vertical_texts(self):
+        path = os.path.join(HERE, "pdfs/issue-192-example.pdf")
+        laparams = {"detect_vertical": True}
+        with pdfplumber.open(path, laparams=laparams) as pdf:
+            page = pdf.pages[0]
+            assert len(page.textlinehorizontals) == 142
+            assert len(page.textboxhorizontals) == 74
+            assert len(page.textlineverticals) == 11
+            assert len(page.textboxverticals) == 6
 
     def test_issue_383(self):
         with pdfplumber.open(self.path, laparams={}) as pdf: