v0.5.0

- Completely overhauls the approach to table extraction. - Adds visual debugging. - See CHANGELOG.md for details.
jsvine · Feb 25, 2017 · 955b126 · 955b126
1 parent d5afcf6
commit 955b126
Show file tree

Hide file tree

Showing 24 changed files with 2,606 additions and 524 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,45 @@
+# Change Log
+
+All notable changes to this project will be documented in this file. The log currently goes back to `v0.4.3`.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/).
+
+## [0.5.0] - 2017-02-25
+### Added
+- Visual debugging features, via `Page.to_image(...)` and `PageImage`. (Introduces `wand` and `pillow` as package requirements.)
+- More powerful options for extracting data from tables. See changes below.
+
+### Changed
+- Entirely overhaul the table-extraction methods. Now based on [Anssi Nurminen's master's thesis](http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3).
+- Disentangle `.crop` from `.intersects_bbox` and `.within_bbox`.
+- Change default `x_tolerance` and `y_tolerance` for word extraction from `5` to `3`.
+
+### Fixed
+- Fix bug stemming from non-decimalized page heights. [h/t @jsfenfen]
+
+## [0.4.6] - 2017-01-26
+### Added
+- Provide access to `Page.page_number`
+
+### Changed
+- Use `.page_number` instead of `.page_id` as primary identifier. [h/t @jsfenfen]
+- Change default `x_tolerance` and `y_tolerance` for word extraction from `0` to `5`.
+
+### Fixed
+- Provide proper support for rotated pages
+
+## [0.4.5] - 2016-12-09
+### Fixed
+- Fix bug stemming from when metadata includes a PostScript literal. [h/t @boblannon]
+
+
+## [0.4.4] - Mistakenly skipped
+
+Whoops.
+
+## [0.4.3] - 2016-04-12
+### Changed
+- When extracting table cells, use chars' midpoints instead of top-points.
+
+### Fixed
+- Fix `find_gutters`, which should ignore `" "` (space) chars.
diff --git a/README.md b/README.md
diff --git a/examples/notebooks/extract-table-ca-warn-report.ipynb b/examples/notebooks/extract-table-ca-warn-report.ipynb
diff --git a/examples/notebooks/extract-table-nics.ipynb b/examples/notebooks/extract-table-nics.ipynb
diff --git a/examples/pdfs/ca-warn-report.pdf b/examples/pdfs/ca-warn-report.pdf
diff --git a/examples/screenshots/visual-debugging-in-jupyter.png b/examples/screenshots/visual-debugging-in-jupyter.png
diff --git a/pdfplumber/__init__.py b/pdfplumber/__init__.py
@@ -5,9 +5,6 @@
 pdfminer.pdftypes.STRICT = False
 pdfminer.pdfinterp.STRICT = False
 
-VERSION_TUPLE = (0, 4, 6)
-VERSION = ".".join(map(str, VERSION_TUPLE))
-
 def load(file_or_buffer, **kwargs):
     return PDF(file_or_buffer, **kwargs)
 

diff --git a/pdfplumber/_version.py b/pdfplumber/_version.py
@@ -0,0 +1,2 @@
+version_info = (0, 5, 0)
+__version__ = '.'.join(map(str, version_info))
diff --git a/pdfplumber/cli.py b/pdfplumber/cli.py
@@ -34,9 +34,10 @@ def parse_args():
     parser.add_argument("--encoding",
         default="utf-8")
 
+    TYPE_DEFAULTS = [ "char", "anno", "line", "curve", "rect" ]
     parser.add_argument("--types", nargs="+",
-        choices=[ "char", "anno", "line", "rect", "rect_edge" ],
-        default=[ "char", "anno", "line", "rect" ])
+        choices=TYPE_DEFAULTS + [ "rect_edge" ],
+        default=TYPE_DEFAULTS)
 
     parser.add_argument("--pages", nargs="+",
         type=parse_page_spec)

diff --git a/pdfplumber/container.py b/pdfplumber/container.py
@@ -1,40 +1,6 @@
 from itertools import chain
 from pdfplumber import utils
 
-def rect_to_edges(rect):
-    top, bottom, left, right = [ dict(rect) for x in range(4) ]
-    top.update({
-        "object_type": "rect_edge",
-        "height": 0,
-        "y0": rect["y1"],
-        "orientation": "h"
-    })
-    bottom.update({
-        "object_type": "rect_edge",
-        "height": 0,
-        "doctop": rect["doctop"] + rect["height"],
-        "y1": rect["y0"],
-        "orientation": "h"
-    })
-    left.update({
-        "object_type": "rect_edge",
-        "width": 0,
-        "x1": rect["x0"],
-        "orientation": "v"
-    })
-    right.update({
-        "object_type": "rect_edge",
-        "width": 0,
-        "x0": rect["x1"],
-        "orientation": "v"
-    })
-    return [ top, bottom, left, right ]
-
-def line_to_edge(line):
-    edge = dict(line)
-    edge["orientation"] = "h" if (line["y0"] == line["y1"]) else "v"
-    return edge
-
 class Container(object):
     cached_properties = [ "_rect_edges", "_edges", "_objects" ]
 
@@ -52,6 +18,10 @@ def rects(self):
     def lines(self):
         return self.objects.get("line", [])
 
+    @property
+    def curves(self):
+        return self.objects.get("curve", [])
+
     @property
     def images(self):
         return self.objects.get("image", [])
@@ -71,13 +41,23 @@ def annos(self):
     @property
     def rect_edges(self):
         if hasattr(self, "_rect_edges"): return self._edges
-        rect_edges_gen = (rect_to_edges(r) for r in self.rects)
+        rect_edges_gen = (utils.rect_to_edges(r) for r in self.rects)
         self._rect_edges = list(chain(*rect_edges_gen))
         return self._rect_edges
 
     @property
     def edges(self):
         if hasattr(self, "_edges"): return self._edges
-        line_edges = list(map(line_to_edge, self.lines))
+        line_edges = list(map(utils.line_to_edge, self.lines))
         self._edges = self.rect_edges + line_edges
         return self._edges
+
+    @property
+    def horizontal_edges(self):
+        test = lambda x: x["orientation"] == "h"
+        return list(filter(test, self.edges))
+
+    @property
+    def vertical_edges(self):
+        test = lambda x: x["orientation"] == "v"
+        return list(filter(test, self.edges))
diff --git a/pdfplumber/display.py b/pdfplumber/display.py
@@ -0,0 +1,233 @@
+import PIL.Image
+import PIL.ImageDraw
+import wand.image
+import sys, os
+from io import BytesIO
+from pdfplumber import utils
+from pdfplumber.table import TableFinder
+
+class COLORS(object):
+    """Named color tuples used by the drawing defaults and helpers in this module."""
+    RED = (255, 0, 0)
+    GREEN = (0, 255, 0)
+    BLUE = (0, 0, 255)
+    # Four components, all zero, unlike the three-component colors above.
+    TRANSPARENT = (0, 0, 0, 0)
+
+# Default fill: COLORS.BLUE with a fourth component of 50 appended.
+DEFAULT_FILL = COLORS.BLUE + (50,)
+# Default stroke: COLORS.RED with a fourth component of 200 appended.
+DEFAULT_STROKE = COLORS.RED + (200,)
+DEFAULT_STROKE_WIDTH = 1
+# Default `resolution` that PageImage hands to get_page_image().
+DEFAULT_RESOLUTION = 72
+
+def get_page_image(pdf_path, page_no, resolution):
+    """
+    Open `pdf_path` through wand, addressed as `pdf_path[page_no]`, and
+    return the result as an RGB PIL image.
+
+    `page_no` is presumably a zero-based page index (PageImage passes
+    `page.page_number - 1`). `resolution` is passed straight through to
+    `wand.image.Image`; for that constructor, see
+    http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image
+    """
+    page_path = "{0}[{1}]".format(pdf_path, page_no)
+    with wand.image.Image(filename=page_path, resolution=resolution) as img:
+        with img.convert("png") as png:
+            # Go through an in-memory PNG blob to get from wand to PIL.
+            im = PIL.Image.open(BytesIO(png.make_blob())).convert("RGB")
+            return im
+
+class PageImage(object):
+    def __init__(self, page, original=None, resolution=DEFAULT_RESOLUTION):
+        self.page = page
+        if original == None:
+            self.original = get_page_image(
+                page.pdf.stream.name,
+                page.page_number - 1,
+                resolution
+            )
+        else:
+            self.original = original
+
+        d = self.page.decimalize
+        self.decimalize = d
+        if page.is_original:
+            self.root = page
+            cropped = False
+        else:
+            self.root = page.root_page
+            cropped = page.root_page.bbox != page.bbox
+        self.scale = d(self.original.size[0]) / d(self.root.width)
+        if cropped:
+            cropbox = (
+                (page.bbox[0] - page.root_page.bbox[0]) * self.scale,
+                (page.bbox[1] - page.root_page.bbox[1]) * self.scale,
+                (page.bbox[2] - page.root_page.bbox[0]) * self.scale,
+                (page.bbox[3] - page.root_page.bbox[1]) * self.scale,
+            )
+            self.original = self.original.crop(map(int, cropbox))
+        self.reset()
+
+    def _reproject_bbox(self, bbox):
+        x0, top, x1, bottom = bbox
+        _x0, _top = self._reproject((x0, top))
+        _x1, _bottom = self._reproject((x1, bottom))
+        return (_x0, _top, _x1, _bottom)
+
+    def _reproject(self, coord):
+        """Given an (x0, top) tuple from the *root* coordinate system,
+        return an (x0, top) tuple in the *image* coordinate system.
+        """
+        x0, top = coord
+        px0, ptop = self.page.bbox[:2]
+        rx0, rtop = self.root.bbox[:2]
+        _x0 = (x0 + rx0 - px0) * self.scale
+        _top = (top + rtop - ptop) * self.scale
+        return (_x0, _top)
+
+    def reset(self):
+        self.annotated = PIL.Image.new(self.original.mode, self.original.size)
+        self.annotated.paste(self.original)
+        self.draw = PIL.ImageDraw.Draw(self.annotated, "RGBA")
+        return self
+
+    def copy(self):
+        return self.__class__(self.page, self.original)
+
+    def draw_line(self, points_or_line,
+        stroke=DEFAULT_STROKE,
+        stroke_width=DEFAULT_STROKE_WIDTH):
+        if isinstance(points_or_line, (tuple, list)):
+            points = points_or_line
+        else:
+            obj = points_or_line
+            points = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])
+        self.draw.line(
+            self._reproject_bbox(points),
+            fill=stroke,
+            width=stroke_width
+        )
+        return self
+
+    def draw_lines(self, list_of_lines, **kwargs):
+        for x in list_of_lines:
+            self.draw_line(x, **kwargs)
+        return self
+
+    def draw_rect(self, bbox_or_obj,
+        fill=DEFAULT_FILL,
+        stroke=DEFAULT_STROKE,
+        stroke_width=DEFAULT_STROKE_WIDTH):
+        if isinstance(bbox_or_obj, (tuple, list)):
+            bbox = bbox_or_obj
+        else:
+            obj = bbox_or_obj
+            bbox = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])
+
+        x0, top, x1, bottom = bbox
+        half = self.decimalize(stroke_width / 2)
+        x0 += half
+        top += half
+        x1 -= half
+        bottom -= half
+
+        self.draw.rectangle(
+            self._reproject_bbox((x0, top, x1, bottom)),
+            fill,
+            COLORS.TRANSPARENT
+        )
+
+        if stroke_width > 0:
+            segments = [
+                (x0, top, x1, top), # top
+                (x0, bottom, x1, bottom), # bottom
+                (x0, top, x0, bottom), # left
+                (x1, top, x1, bottom), # right
+            ]
+            self.draw_lines(
+                segments,
+                stroke=stroke,
+                stroke_width=stroke_width
+            )
+        return self
+
+    def draw_rects(self, list_of_rects, **kwargs):
+        for x in list_of_rects:
+            self.draw_rect(x, **kwargs)
+        return self
+
+    def draw_circle(self, center_or_obj,
+        radius=5,
+        fill=DEFAULT_FILL,
+        stroke=DEFAULT_STROKE):
+        if isinstance(center_or_obj, (tuple, list)):
+            center = center_or_obj
+        else:
+            obj = center_or_obj
+            center = (
+                (obj["x0"] + obj["x1"]) / 2,
+                (obj["top"] + obj["bottom"]) / 2
+            )
+        cx, cy = center
+        bbox = (cx - radius, cy - radius, cx + radius, cy + radius)
+        self.draw.ellipse(
+            self._reproject_bbox(bbox),
+            fill,
+            stroke
+        )
+        return self
+
+    def draw_circles(self, list_of_circles, **kwargs):
+        for x in list_of_circles:
+            self.draw_circle(x, **kwargs)
+        return self
+
+    def save(self, *args, **kwargs):
+        return self.annotated.save(*args, **kwargs)
+
+    def debug_table(self, table,
+        fill=DEFAULT_FILL,
+        stroke=DEFAULT_STROKE,
+        stroke_width=1):
+        """
+        Outline all found tables.
+        """
+        self.draw_rects(table.cells,
+            fill=fill,
+            stroke=stroke,
+            stroke_width=stroke_width)
+        return self
+
+    def debug_tablefinder(self, tf={}):
+        if isinstance(tf, TableFinder):
+            pass
+        elif isinstance(tf, dict):
+            tf = self.page.debug_tablefinder(tf)
+        else:
+            raise ValueError("Argument must be instance of TableFinder or a TableFinder settings dict.")
+
+        for table in tf.tables:
+            self.debug_table(table)
+
+        self.draw_lines(tf.edges, stroke_width=1)
+
+        self.draw_circles(tf.intersections.keys(),
+            fill=COLORS.TRANSPARENT,
+            stroke=COLORS.BLUE + (200,),
+            radius=3)
+        return self
+
+    def outline_words(self,
+        stroke=DEFAULT_STROKE,
+        fill=DEFAULT_FILL,
+        stroke_width=DEFAULT_STROKE_WIDTH,
+        x_tolerance=utils.DEFAULT_X_TOLERANCE,
+        y_tolerance=utils.DEFAULT_Y_TOLERANCE):
+
+        words = self.page.extract_words(x_tolerance=x_tolerance, y_tolerance=y_tolerance)
+        self.draw_rects(words, stroke=stroke, fill=fill, stroke_width=stroke_width)
+        return self
+
+    def outline_chars(self,
+        stroke=(255, 0, 0, 255),
+        fill=(255, 0, 0, int(255/4)),
+        stroke_width=DEFAULT_STROKE_WIDTH):
+
+        self.draw_rects(self.page.chars, stroke=stroke, fill=fill, stroke_width=stroke_width)
+        return self
+
+    def _repr_png_(self):
+        b = BytesIO()
+        self.annotated.save(b, 'PNG')
+        return b.getvalue()
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		version_info = (0, 5, 0)
		__version__ = '.'.join(map(str, version_info))