-
Notifications
You must be signed in to change notification settings - Fork 688
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Completely overhauls the approach to table extraction. - Adds visual debugging. - See CHANGELOG.md for details.
- Loading branch information
Showing
24 changed files
with
2,606 additions
and
524 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Change Log | ||
|
||
All notable changes to this project will be documented in this file. The log currently goes back to `v0.4.3`. | ||
|
||
The format is based on [Keep a Changelog](http://keepachangelog.com/). | ||
|
||
## [0.5.0] - 2017-02-25 | ||
### Added | ||
- Visual debugging features, via `Page.to_image(...)` and `PageImage`. (Introduces `wand` and `pillow` as package requirements.) | ||
- More powerful options for extracting data from tables. See changes below. | ||
|
||
### Changed | ||
- Entirely overhaul the table-extraction methods. Now based on [Anssi Nurminen's master's thesis](http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3). | ||
- Disentangle `.crop` from `.intersects_bbox` and `.within_bbox`. | ||
- Change default `x_tolerance` and `y_tolerance` for word extraction from `5` to `3` | ||
|
||
### Fixed | ||
- Fix bug stemming from non-decimalized page heights. [h/t @jsfenfen] | ||
|
||
## [0.4.6] - 2017-01-26 | ||
### Added | ||
- Provide access to `Page.page_number` | ||
|
||
### Changed | ||
- Use `.page_number` instead of `.page_id` as primary identifier. [h/t @jsfenfen] | ||
- Change default `x_tolerance` and `y_tolerance` for word extraction from `0` to `5` | ||
|
||
### Fixed | ||
- Provide proper support for rotated pages | ||
|
||
## [0.4.5] - 2016-12-09 | ||
### Fixed | ||
- Fix bug stemming from when metadata includes a PostScript literal. [h/t @boblannon] | ||
|
||
|
||
## [0.4.4] - Mistakenly skipped | ||
|
||
Whoops. | ||
|
||
## [0.4.3] - 2016-04-12 | ||
### Changed | ||
- When extracting table cells, use chars' midpoints instead of top-points. | ||
|
||
### Fixed | ||
- Fix `find_gutters` so that it ignores `" "` (space) chars |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Package version as a tuple of integers: (major, minor, patch).
version_info = (0, 5, 0)

# Dotted string form of the version, derived from ``version_info`` so the
# two can never drift apart.
__version__ = ".".join(map(str, version_info))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,233 @@ | ||
import PIL.Image | ||
import PIL.ImageDraw | ||
import wand.image | ||
import sys, os | ||
from io import BytesIO | ||
from pdfplumber import utils | ||
from pdfplumber.table import TableFinder | ||
|
||
class COLORS(object):
    """Named color constants used when drawing annotations.

    ``RED``, ``GREEN`` and ``BLUE`` are 3-tuples, so a fourth (alpha)
    component can be appended with ``COLORS.RED + (alpha,)``.
    ``TRANSPARENT`` is already a 4-tuple whose last component is 0.
    """

    RED = (255, 0, 0)
    GREEN = (0, 255, 0)
    BLUE = (0, 0, 255)
    TRANSPARENT = (0, 0, 0, 0)
|
||
# Default annotation styling, shared by the PageImage drawing methods.
# Fill and stroke are 4-tuples: a COLORS 3-tuple with a fourth component
# appended (50 for the fill, 200 for the stroke).
DEFAULT_FILL = COLORS.BLUE + (50,)
DEFAULT_STROKE = COLORS.RED + (200,)
DEFAULT_STROKE_WIDTH = 1

# Resolution handed to wand when rendering a page to an image.
# NOTE(review): presumably dots per inch -- confirm against the wand docs.
DEFAULT_RESOLUTION = 72
|
||
def get_page_image(pdf_path, page_no, resolution):
    """Render one page of a PDF file and return it as an RGB PIL image.

    Args:
        pdf_path: Filesystem path of the PDF.
        page_no: Index of the page to render. It is appended to the path
            as ``"path[page_no]"``; ``PageImage`` passes
            ``page.page_number - 1`` here.
        resolution: Passed straight through to ``wand.image.Image``. See
            http://docs.wand-py.org/en/latest/wand/image.html#wand.image.Image

    Returns:
        A ``PIL.Image.Image`` in ``"RGB"`` mode.
    """
    page_path = "{0}[{1}]".format(pdf_path, page_no)
    with wand.image.Image(filename=page_path, resolution=resolution) as img:
        with img.convert("png") as png:
            # Go through an in-memory PNG so that PIL can read what wand
            # rendered; converting to RGB drops any alpha channel.
            im = PIL.Image.open(BytesIO(png.make_blob())).convert("RGB")
            return im
|
||
class PageImage(object):
    """An image of a page plus a drawing layer for visual debugging.

    ``original`` holds the rendered (and, for cropped pages, cropped)
    page image and is never drawn on. ``annotated`` is a copy of it that
    the ``draw_*`` / ``outline_*`` / ``debug_*`` methods paint onto via
    ``draw``. Those methods return ``self`` so calls can be chained.
    """

    def __init__(self, page, original=None, resolution=DEFAULT_RESOLUTION):
        """Build the image for ``page``.

        Args:
            page: The page to depict. It must provide ``decimalize``,
                ``is_original``, ``bbox`` and, when ``original`` is not
                given, ``pdf.stream.name`` and ``page_number``. Pages that
                are not original must also provide ``root_page``.
            original: An already-rendered image of the *whole root page*.
                When ``None``, the page is rendered from the PDF on disk.
            resolution: Rendering resolution, used only when ``original``
                is ``None``.
        """
        self.page = page
        if original is None:
            self.original = get_page_image(
                page.pdf.stream.name,
                page.page_number - 1,
                resolution
            )
        else:
            self.original = original

        d = self.page.decimalize
        self.decimalize = d
        if page.is_original:
            self.root = page
            cropped = False
        else:
            self.root = page.root_page
            cropped = page.root_page.bbox != page.bbox

        # Image pixels per page unit, measured on the full root page.
        self.scale = d(self.original.size[0]) / d(self.root.width)

        if cropped:
            # Express the page's bbox relative to the root page's top-left
            # corner, in pixels, and keep only that region of the image.
            cropbox = (
                (page.bbox[0] - page.root_page.bbox[0]) * self.scale,
                (page.bbox[1] - page.root_page.bbox[1]) * self.scale,
                (page.bbox[2] - page.root_page.bbox[0]) * self.scale,
                (page.bbox[3] - page.root_page.bbox[1]) * self.scale,
            )
            # PIL documents the crop box as a 4-tuple, so materialize the
            # map() rather than handing over a lazy iterator.
            self.original = self.original.crop(tuple(map(int, cropbox)))
        self.reset()

    def _reproject_bbox(self, bbox):
        """Convert an ``(x0, top, x1, bottom)`` bbox to image coordinates
        by reprojecting its two corners with ``_reproject``.
        """
        x0, top, x1, bottom = bbox
        _x0, _top = self._reproject((x0, top))
        _x1, _bottom = self._reproject((x1, bottom))
        return (_x0, _top, _x1, _bottom)

    def _reproject(self, coord):
        """Given an (x0, top) tuple from the *root* coordinate system,
        return an (x0, top) tuple in the *image* coordinate system.

        The point is shifted by the difference between the root page's
        and this page's top-left corners, then multiplied by ``scale``.
        """
        x0, top = coord
        px0, ptop = self.page.bbox[:2]
        rx0, rtop = self.root.bbox[:2]
        _x0 = (x0 + rx0 - px0) * self.scale
        _top = (top + rtop - ptop) * self.scale
        return (_x0, _top)

    def reset(self):
        """Discard all annotations by rebuilding ``annotated`` (and its
        ``draw`` handle) from ``original``. Returns ``self``.
        """
        self.annotated = PIL.Image.new(self.original.mode, self.original.size)
        self.annotated.paste(self.original)
        self.draw = PIL.ImageDraw.Draw(self.annotated, "RGBA")
        return self

    def copy(self):
        """Return a new, un-annotated image of the same page.

        The duplicate shares ``original`` (which is never drawn on) and
        gets its own fresh ``annotated`` layer.
        """
        # Do not go back through __init__: self.original has already been
        # cropped there, so re-running it would compute ``scale`` from the
        # cropped width and crop a second time for cropped pages.
        duplicate = self.__class__.__new__(self.__class__)
        duplicate.__dict__.update(self.__dict__)
        duplicate.reset()
        return duplicate

    def draw_line(self, points_or_line,
                  stroke=DEFAULT_STROKE,
                  stroke_width=DEFAULT_STROKE_WIDTH):
        """Draw one line segment. Returns ``self``.

        Args:
            points_or_line: Either a tuple/list of coordinates, or a
                mapping with ``"x0"``, ``"top"``, ``"x1"`` and
                ``"bottom"`` keys.
            stroke: Line color.
            stroke_width: Line width, passed to PIL as ``width``.
        """
        if isinstance(points_or_line, (tuple, list)):
            points = points_or_line
        else:
            obj = points_or_line
            points = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])
        self.draw.line(
            self._reproject_bbox(points),
            fill=stroke,
            width=stroke_width
        )
        return self

    def draw_lines(self, list_of_lines, **kwargs):
        """Call ``draw_line`` for every item, forwarding ``kwargs``.
        Returns ``self``.
        """
        for x in list_of_lines:
            self.draw_line(x, **kwargs)
        return self

    def draw_rect(self, bbox_or_obj,
                  fill=DEFAULT_FILL,
                  stroke=DEFAULT_STROKE,
                  stroke_width=DEFAULT_STROKE_WIDTH):
        """Draw a filled rectangle with an outline. Returns ``self``.

        Args:
            bbox_or_obj: Either an ``(x0, top, x1, bottom)`` tuple/list,
                or a mapping with those four keys.
            fill: Interior color.
            stroke: Outline color.
            stroke_width: Outline width; no outline is drawn when it is
                not greater than 0.
        """
        if isinstance(bbox_or_obj, (tuple, list)):
            bbox = bbox_or_obj
        else:
            obj = bbox_or_obj
            bbox = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])

        # Pull every edge inwards by half the stroke width before drawing.
        x0, top, x1, bottom = bbox
        half = self.decimalize(stroke_width / 2)
        x0 += half
        top += half
        x1 -= half
        bottom -= half

        # The fill is drawn with a transparent outline; the visible
        # outline is drawn separately below as four line segments.
        self.draw.rectangle(
            self._reproject_bbox((x0, top, x1, bottom)),
            fill,
            COLORS.TRANSPARENT
        )

        if stroke_width > 0:
            segments = [
                (x0, top, x1, top),  # top
                (x0, bottom, x1, bottom),  # bottom
                (x0, top, x0, bottom),  # left
                (x1, top, x1, bottom),  # right
            ]
            self.draw_lines(
                segments,
                stroke=stroke,
                stroke_width=stroke_width
            )
        return self

    def draw_rects(self, list_of_rects, **kwargs):
        """Call ``draw_rect`` for every item, forwarding ``kwargs``.
        Returns ``self``.
        """
        for x in list_of_rects:
            self.draw_rect(x, **kwargs)
        return self

    def draw_circle(self, center_or_obj,
                    radius=5,
                    fill=DEFAULT_FILL,
                    stroke=DEFAULT_STROKE):
        """Draw a circle. Returns ``self``.

        Args:
            center_or_obj: Either an ``(x, y)`` tuple/list giving the
                center, or a mapping with ``"x0"``, ``"top"``, ``"x1"``
                and ``"bottom"`` keys, whose midpoint is used.
            radius: Circle radius, applied before reprojection (i.e. in
                the same units as the center coordinates).
            fill: Interior color.
            stroke: Outline color.
        """
        if isinstance(center_or_obj, (tuple, list)):
            center = center_or_obj
        else:
            obj = center_or_obj
            center = (
                (obj["x0"] + obj["x1"]) / 2,
                (obj["top"] + obj["bottom"]) / 2
            )
        cx, cy = center
        bbox = (cx - radius, cy - radius, cx + radius, cy + radius)
        self.draw.ellipse(
            self._reproject_bbox(bbox),
            fill,
            stroke
        )
        return self

    def draw_circles(self, list_of_circles, **kwargs):
        """Call ``draw_circle`` for every item, forwarding ``kwargs``.
        Returns ``self``.
        """
        for x in list_of_circles:
            self.draw_circle(x, **kwargs)
        return self

    def save(self, *args, **kwargs):
        """Save the annotated image; all arguments are forwarded to the
        PIL image's ``save`` and its result is returned.
        """
        return self.annotated.save(*args, **kwargs)

    def debug_table(self, table,
                    fill=DEFAULT_FILL,
                    stroke=DEFAULT_STROKE,
                    stroke_width=1):
        """Outline every cell in ``table.cells``. Returns ``self``."""
        self.draw_rects(table.cells,
                        fill=fill,
                        stroke=stroke,
                        stroke_width=stroke_width)
        return self

    def debug_tablefinder(self, tf=None):
        """Draw a table finder's tables, edges and intersections.

        Args:
            tf: Either a ``TableFinder`` instance, or a settings dict that
                is passed to ``self.page.debug_tablefinder`` to obtain
                one. ``None`` (the default) is treated as an empty dict.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``tf`` is neither a ``TableFinder`` nor a dict.
        """
        if tf is None:
            # A fresh dict per call, rather than a shared mutable default.
            tf = {}

        if isinstance(tf, TableFinder):
            pass
        elif isinstance(tf, dict):
            tf = self.page.debug_tablefinder(tf)
        else:
            raise ValueError("Argument must be instance of TableFinder or a TableFinder settings dict.")

        for table in tf.tables:
            self.debug_table(table)

        self.draw_lines(tf.edges, stroke_width=1)

        # The keys of ``tf.intersections`` are used as the circle centers.
        self.draw_circles(tf.intersections.keys(),
                          fill=COLORS.TRANSPARENT,
                          stroke=COLORS.BLUE + (200,),
                          radius=3)
        return self

    def outline_words(self,
                      stroke=DEFAULT_STROKE,
                      fill=DEFAULT_FILL,
                      stroke_width=DEFAULT_STROKE_WIDTH,
                      x_tolerance=utils.DEFAULT_X_TOLERANCE,
                      y_tolerance=utils.DEFAULT_Y_TOLERANCE):
        """Draw a rectangle around each word returned by
        ``self.page.extract_words`` for the given tolerances.
        Returns ``self``.
        """
        words = self.page.extract_words(x_tolerance=x_tolerance, y_tolerance=y_tolerance)
        self.draw_rects(words, stroke=stroke, fill=fill, stroke_width=stroke_width)
        return self

    def outline_chars(self,
                      stroke=(255, 0, 0, 255),
                      fill=(255, 0, 0, int(255/4)),
                      stroke_width=DEFAULT_STROKE_WIDTH):
        """Draw a rectangle around each item of ``self.page.chars``.
        Returns ``self``.
        """
        self.draw_rects(self.page.chars, stroke=stroke, fill=fill, stroke_width=stroke_width)
        return self

    def _repr_png_(self):
        """Return the annotated image encoded as PNG bytes.

        NOTE(review): the name matches the hook IPython/Jupyter uses for
        inline PNG display -- presumably that is the intended caller.
        """
        b = BytesIO()
        self.annotated.save(b, 'PNG')
        return b.getvalue()
|
Oops, something went wrong.