Skip to content

Commit

Permalink
v0.5.0
Browse files Browse the repository at this point in the history
- Completely overhauls the approach to table extraction.
- Adds visual debugging.
- See CHANGELOG.md for details.
  • Loading branch information
jsvine committed Feb 25, 2017
1 parent d5afcf6 commit 955b126
Show file tree
Hide file tree
Showing 24 changed files with 2,606 additions and 524 deletions.
45 changes: 45 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Change Log

All notable changes to this project will be documented in this file. The log currently goes back to `v0.4.3`.

The format is based on [Keep a Changelog](http://keepachangelog.com/).

## [0.5.0] - 2017-02-25
### Added
- Visual debugging features, via `Page.to_image(...)` and `PageImage`. (Introduces `wand` and `pillow` as package requirements.)
- More powerful options for extracting data from tables. See changes below.

### Changed
- Entirely overhaul the table-extraction methods. Now based on [Anssi Nurminen's master's thesis](http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3).
- Disentangle `.crop` from `.intersects_bbox` and `.within_bbox`.
- Change default `x_tolerance` and `y_tolerance` for word extraction from `5` to `3`

### Fixed
- Fix bug stemming from non-decimalized page heights. [h/t @jsfenfen]

## [0.4.6] - 2017-01-26
### Added
- Provide access to `Page.page_number`

### Changed
- Use `.page_number` instead of `.page_id` as primary identifier. [h/t @jsfenfen]
- Change default `x_tolerance` and `y_tolerance` for word extraction from `0` to `5`

### Fixed
- Provide proper support for rotated pages

## [0.4.5] - 2016-12-09
### Fixed
- Fix bug stemming from when metadata includes a PostScript literal. [h/t @boblannon]


## [0.4.4] - Mistakenly skipped

Whoops.

## [0.4.3] - 2016-04-12
### Changed
- When extracting table cells, use chars' midpoints instead of top-points.

### Fixed
- Fix `find_gutters`, which should ignore `" "` chars.
306 changes: 207 additions & 99 deletions README.md

Large diffs are not rendered by default.

759 changes: 759 additions & 0 deletions examples/notebooks/extract-table-ca-warn-report.ipynb

Large diffs are not rendered by default.

413 changes: 327 additions & 86 deletions examples/notebooks/extract-table-nics.ipynb

Large diffs are not rendered by default.

Binary file added examples/pdfs/ca-warn-report.pdf
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 0 additions & 3 deletions pdfplumber/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@
pdfminer.pdftypes.STRICT = False
pdfminer.pdfinterp.STRICT = False

VERSION_TUPLE = (0, 4, 6)
VERSION = ".".join(map(str, VERSION_TUPLE))

def load(file_or_buffer, **kwargs):
    """Construct and return a `PDF` from `file_or_buffer`.

    All keyword arguments are forwarded to the `PDF` constructor unchanged.
    """
    pdf = PDF(file_or_buffer, **kwargs)
    return pdf

Expand Down
2 changes: 2 additions & 0 deletions pdfplumber/_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Version components as integers; the dotted string is derived from this tuple.
version_info = (0, 5, 0)
__version__ = '.'.join(str(part) for part in version_info)
5 changes: 3 additions & 2 deletions pdfplumber/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ def parse_args():
parser.add_argument("--encoding",
default="utf-8")

TYPE_DEFAULTS = [ "char", "anno", "line", "curve", "rect" ]
parser.add_argument("--types", nargs="+",
choices=[ "char", "anno", "line", "rect", "rect_edge" ],
default=[ "char", "anno", "line", "rect" ])
choices=TYPE_DEFAULTS + [ "rect_edge" ],
default=TYPE_DEFAULTS)

parser.add_argument("--pages", nargs="+",
type=parse_page_spec)
Expand Down
52 changes: 16 additions & 36 deletions pdfplumber/container.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,6 @@
from itertools import chain
from pdfplumber import utils

def rect_to_edges(rect):
    """Split a rect dict into four "rect_edge" dicts.

    Every edge starts as a shallow copy of `rect` and then has only the keys
    listed below overridden; `rect` itself is left untouched.

    Returns a list in the order [top, bottom, left, right].
    """
    per_edge_overrides = (
        # top: zero height, y0 moved onto y1
        {
            "object_type": "rect_edge",
            "height": 0,
            "y0": rect["y1"],
            "orientation": "h",
        },
        # bottom: zero height, y1 moved onto y0, doctop pushed down by height
        {
            "object_type": "rect_edge",
            "height": 0,
            "doctop": rect["doctop"] + rect["height"],
            "y1": rect["y0"],
            "orientation": "h",
        },
        # left: zero width, x1 moved onto x0
        {
            "object_type": "rect_edge",
            "width": 0,
            "x1": rect["x0"],
            "orientation": "v",
        },
        # right: zero width, x0 moved onto x1
        {
            "object_type": "rect_edge",
            "width": 0,
            "x0": rect["x1"],
            "orientation": "v",
        },
    )

    edges = []
    for changes in per_edge_overrides:
        edge = dict(rect)
        edge.update(changes)
        edges.append(edge)
    return edges

def line_to_edge(line):
    """Return a copy of a line dict tagged with an "orientation" key.

    The copy is marked "h" when the line's y0 equals its y1 and "v" in every
    other case (so a line that is not axis-aligned is also marked "v").
    The input dict is not modified.
    """
    if line["y0"] == line["y1"]:
        orientation = "h"
    else:
        orientation = "v"
    return dict(line, orientation=orientation)

class Container(object):
cached_properties = [ "_rect_edges", "_edges", "_objects" ]

Expand All @@ -52,6 +18,10 @@ def rects(self):
def lines(self):
return self.objects.get("line", [])

@property
def curves(self):
    """Return the "curve" entry of `self.objects`, or an empty list if that key is absent."""
    found = self.objects.get("curve", [])
    return found

@property
def images(self):
    """Return the "image" entry of `self.objects`, or an empty list if that key is absent."""
    found = self.objects.get("image", [])
    return found
Expand All @@ -71,13 +41,23 @@ def annos(self):
@property
def rect_edges(self):
    """Return the edges derived from `self.rects`, computing them once.

    Each rect is expanded with `utils.rect_to_edges` and the results are
    flattened into a single list, which is cached on `self._rect_edges`.
    """
    # A cache hit must hand back the rect-edge list itself. `self._edges`
    # is the separate cache of the `edges` property (rect edges plus line
    # edges) and may not even exist yet when this property is read first.
    if hasattr(self, "_rect_edges"):
        return self._rect_edges
    rect_edges_gen = (utils.rect_to_edges(r) for r in self.rects)
    self._rect_edges = list(chain.from_iterable(rect_edges_gen))
    return self._rect_edges

@property
def edges(self):
    """Return rect edges followed by line edges, computing them once.

    Lines are converted with `utils.line_to_edge`; the combined list is
    cached on `self._edges` and returned as-is on later reads.
    """
    if not hasattr(self, "_edges"):
        from_lines = [utils.line_to_edge(ln) for ln in self.lines]
        self._edges = self.rect_edges + from_lines
    return self._edges

@property
def horizontal_edges(self):
    """Return a new list of the entries in `self.edges` whose "orientation" is "h"."""
    return [edge for edge in self.edges if edge["orientation"] == "h"]

@property
def vertical_edges(self):
    """Return a new list of the entries in `self.edges` whose "orientation" is "v"."""
    return [edge for edge in self.edges if edge["orientation"] == "v"]
233 changes: 233 additions & 0 deletions pdfplumber/display.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
import PIL.Image
import PIL.ImageDraw
import wand.image
import sys, os
from io import BytesIO
from pdfplumber import utils
from pdfplumber.table import TableFinder

class COLORS(object):
    """Namespace of color tuples used as drawing defaults.

    RED, GREEN and BLUE are three-component tuples; TRANSPARENT is a
    four-component tuple of zeros.
    """

    # Three-component colors.
    RED = (255, 0, 0)
    GREEN = (0, 255, 0)
    BLUE = (0, 0, 255)

    # Four components, all zero.
    TRANSPARENT = (0, 0, 0, 0)

# Default fill color: COLORS.BLUE with a fourth component of 50 appended.
DEFAULT_FILL = COLORS.BLUE + (50,)
# Default stroke color: COLORS.RED with a fourth component of 200 appended.
DEFAULT_STROKE = COLORS.RED + (200,)
# Default line width used by the drawing methods.
DEFAULT_STROKE_WIDTH = 1
# Default resolution passed on when a page image has to be rendered.
DEFAULT_RESOLUTION = 72

def get_page_image(pdf_path, page_no, resolution):
    """Render one page of a PDF file and return it as an RGB `PIL.Image`.

    `page_no` is appended to `pdf_path` in square brackets ("path[N]") and
    the result is opened with `wand.image.Image` at the given `resolution`.
    The page is converted to PNG in memory and re-opened with PIL.

    For the `wand` constructor, see
    http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image
    """
    page_path = "{0}[{1}]".format(pdf_path, page_no)
    with wand.image.Image(filename=page_path, resolution=resolution) as rendered:
        with rendered.convert("png") as as_png:
            png_bytes = as_png.make_blob()
            im = PIL.Image.open(BytesIO(png_bytes)).convert("RGB")
    return im

class PageImage(object):
    """A rendered image of a page plus a drawing layer for visual debugging.

    `original` holds the page rendering (cropped to the page's bbox when the
    page is a crop of its root page). `annotated` is a copy of it that the
    `draw_*`, `debug_*` and `outline_*` methods paint on via `self.draw`.
    Those methods return `self`, so calls can be chained.
    """

    def __init__(self, page, original=None, resolution=DEFAULT_RESOLUTION):
        """Set up the image for `page`.

        page: the page to depict.
        original: an existing rendering of the whole *root* page. When None,
            one is rendered from `page.pdf.stream.name` at `resolution`.
        resolution: only used when the image has to be rendered here.
        """
        self.page = page
        if original is None:
            original = get_page_image(
                page.pdf.stream.name,
                page.page_number - 1,
                resolution
            )
        # Keep the uncropped rendering: the scale and crop box below are
        # computed against the root page, so copy() has to start from this
        # image and not from the cropped `self.original`.
        self._uncropped = original
        self.original = original

        d = self.page.decimalize
        self.decimalize = d
        if page.is_original:
            self.root = page
            cropped = False
        else:
            self.root = page.root_page
            cropped = page.root_page.bbox != page.bbox
        # Image pixels per page unit, based on the root page's width.
        self.scale = d(self.original.size[0]) / d(self.root.width)
        if cropped:
            cropbox = (
                (page.bbox[0] - page.root_page.bbox[0]) * self.scale,
                (page.bbox[1] - page.root_page.bbox[1]) * self.scale,
                (page.bbox[2] - page.root_page.bbox[0]) * self.scale,
                (page.bbox[3] - page.root_page.bbox[1]) * self.scale,
            )
            # Pillow indexes the crop box, so pass a real tuple instead of
            # a lazy `map` object (which is not subscriptable on Python 3).
            self.original = self.original.crop(tuple(map(int, cropbox)))
        self.reset()

    def _reproject_bbox(self, bbox):
        """Reproject both corners of an (x0, top, x1, bottom) bbox via `_reproject`."""
        x0, top, x1, bottom = bbox
        _x0, _top = self._reproject((x0, top))
        _x1, _bottom = self._reproject((x1, bottom))
        return (_x0, _top, _x1, _bottom)

    def _reproject(self, coord):
        """Given an (x0, top) tuple from the *root* coordinate system,
        return an (x0, top) tuple in the *image* coordinate system.
        """
        x0, top = coord
        px0, ptop = self.page.bbox[:2]
        rx0, rtop = self.root.bbox[:2]
        _x0 = (x0 + rx0 - px0) * self.scale
        _top = (top + rtop - ptop) * self.scale
        return (_x0, _top)

    def reset(self):
        """Discard all annotations: rebuild `annotated` from `original` and
        create a fresh "RGBA"-mode drawing context on it. Returns `self`.
        """
        self.annotated = PIL.Image.new(self.original.mode, self.original.size)
        self.annotated.paste(self.original)
        self.draw = PIL.ImageDraw.Draw(self.annotated, "RGBA")
        return self

    def copy(self):
        """Return a new, unannotated instance for the same page.

        The uncropped rendering is handed to the constructor; passing the
        already-cropped `self.original` would make a cropped page compute a
        wrong scale and be cropped a second time.
        """
        return self.__class__(self.page, self._uncropped)

    def draw_line(self, points_or_line,
            stroke=DEFAULT_STROKE,
            stroke_width=DEFAULT_STROKE_WIDTH):
        """Draw one line segment and return `self`.

        points_or_line: either an (x0, top, x1, bottom) tuple/list, or an
            object indexable by the keys "x0", "top", "x1" and "bottom".
        """
        if isinstance(points_or_line, (tuple, list)):
            points = points_or_line
        else:
            obj = points_or_line
            points = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])
        self.draw.line(
            self._reproject_bbox(points),
            fill=stroke,
            width=stroke_width
        )
        return self

    def draw_lines(self, list_of_lines, **kwargs):
        """Call `draw_line` on every item, forwarding `kwargs`. Returns `self`."""
        for x in list_of_lines:
            self.draw_line(x, **kwargs)
        return self

    def draw_rect(self, bbox_or_obj,
            fill=DEFAULT_FILL,
            stroke=DEFAULT_STROKE,
            stroke_width=DEFAULT_STROKE_WIDTH):
        """Draw a filled rectangle with an optional outline and return `self`.

        bbox_or_obj: either an (x0, top, x1, bottom) tuple/list, or an
            object indexable by the keys "x0", "top", "x1" and "bottom".
        """
        if isinstance(bbox_or_obj, (tuple, list)):
            bbox = bbox_or_obj
        else:
            obj = bbox_or_obj
            bbox = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])

        # Inset the box by half the stroke width on every side.
        x0, top, x1, bottom = bbox
        half = self.decimalize(stroke_width / 2)
        x0 += half
        top += half
        x1 -= half
        bottom -= half

        # The fill is drawn with a transparent outline; the visible outline
        # is drawn separately below as four line segments.
        self.draw.rectangle(
            self._reproject_bbox((x0, top, x1, bottom)),
            fill,
            COLORS.TRANSPARENT
        )

        if stroke_width > 0:
            segments = [
                (x0, top, x1, top), # top
                (x0, bottom, x1, bottom), # bottom
                (x0, top, x0, bottom), # left
                (x1, top, x1, bottom), # right
            ]
            self.draw_lines(
                segments,
                stroke=stroke,
                stroke_width=stroke_width
            )
        return self

    def draw_rects(self, list_of_rects, **kwargs):
        """Call `draw_rect` on every item, forwarding `kwargs`. Returns `self`."""
        for x in list_of_rects:
            self.draw_rect(x, **kwargs)
        return self

    def draw_circle(self, center_or_obj,
            radius=5,
            fill=DEFAULT_FILL,
            stroke=DEFAULT_STROKE):
        """Draw a circle and return `self`.

        center_or_obj: either a (cx, cy) tuple/list, or an object indexable
            by "x0", "x1", "top" and "bottom", whose midpoint is used.
        radius: applied before reprojection, i.e. in the same units as the
            center coordinates.
        """
        if isinstance(center_or_obj, (tuple, list)):
            center = center_or_obj
        else:
            obj = center_or_obj
            center = (
                (obj["x0"] + obj["x1"]) / 2,
                (obj["top"] + obj["bottom"]) / 2
            )
        cx, cy = center
        bbox = (cx - radius, cy - radius, cx + radius, cy + radius)
        self.draw.ellipse(
            self._reproject_bbox(bbox),
            fill,
            stroke
        )
        return self

    def draw_circles(self, list_of_circles, **kwargs):
        """Call `draw_circle` on every item, forwarding `kwargs`. Returns `self`."""
        for x in list_of_circles:
            self.draw_circle(x, **kwargs)
        return self

    def save(self, *args, **kwargs):
        """Save the annotated image; arguments go straight to its `save` method."""
        return self.annotated.save(*args, **kwargs)

    def debug_table(self, table,
            fill=DEFAULT_FILL,
            stroke=DEFAULT_STROKE,
            stroke_width=1):
        """Draw every entry of `table.cells` as a rectangle. Returns `self`."""
        self.draw_rects(table.cells,
            fill=fill,
            stroke=stroke,
            stroke_width=stroke_width)
        return self

    def debug_tablefinder(self, tf=None):
        """Draw a TableFinder's tables, edges and intersections. Returns `self`.

        tf: a `TableFinder` instance, or a settings dict that is passed to
            `self.page.debug_tablefinder` to obtain one. None (the default)
            is treated as an empty settings dict.

        Raises ValueError for any other argument type.
        """
        # A fresh dict per call avoids sharing one mutable default object
        # with whatever receives the settings.
        if tf is None:
            tf = {}

        if isinstance(tf, TableFinder):
            pass
        elif isinstance(tf, dict):
            tf = self.page.debug_tablefinder(tf)
        else:
            raise ValueError("Argument must be instance of TableFinder or a TableFinder settings dict.")

        for table in tf.tables:
            self.debug_table(table)

        self.draw_lines(tf.edges, stroke_width=1)

        self.draw_circles(tf.intersections.keys(),
            fill=COLORS.TRANSPARENT,
            stroke=COLORS.BLUE + (200,),
            radius=3)
        return self

    def outline_words(self,
            stroke=DEFAULT_STROKE,
            fill=DEFAULT_FILL,
            stroke_width=DEFAULT_STROKE_WIDTH,
            x_tolerance=utils.DEFAULT_X_TOLERANCE,
            y_tolerance=utils.DEFAULT_Y_TOLERANCE):
        """Draw a rectangle for each word returned by `self.page.extract_words`
        (called with the given tolerances). Returns `self`.
        """
        words = self.page.extract_words(x_tolerance=x_tolerance, y_tolerance=y_tolerance)
        self.draw_rects(words, stroke=stroke, fill=fill, stroke_width=stroke_width)
        return self

    def outline_chars(self,
            stroke=(255, 0, 0, 255),
            fill=(255, 0, 0, int(255/4)),
            stroke_width=DEFAULT_STROKE_WIDTH):
        """Draw a rectangle for each entry of `self.page.chars`. Returns `self`."""
        self.draw_rects(self.page.chars, stroke=stroke, fill=fill, stroke_width=stroke_width)
        return self

    def _repr_png_(self):
        """Return the annotated image encoded as PNG bytes."""
        b = BytesIO()
        self.annotated.save(b, 'PNG')
        return b.getvalue()

Loading

0 comments on commit 955b126

Please sign in to comment.