From 96d7a2af058030a33685e2a77548dbd5a6a7ea4f Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Sun, 26 Feb 2017 10:44:28 -0500 Subject: [PATCH] v0.5.1; minor fixes/tweaks, adds line quick-draw - Quick-draw `PageImage` methods: `.draw_vline`, `.draw_vlines`, `.draw_hline`, and `.draw_hlines`. - Boolean parameter `keep_blank_chars` for `.extract_words(...)` and `TableFinder` settings. - Increased default `text_tolerance` and `intersection_tolerance` TableFinder values from 1 to 3. - Properly handle conversion of PDFs with transparency to `pillow` images. - Properly handle `pandas` DataFrames as inputs to multi-draw commands (e.g., `PageImage.draw_rects(...)`). --- CHANGELOG.md | 12 +++++++++++ README.md | 20 ++++++++++-------- pdfplumber/_version.py | 2 +- pdfplumber/display.py | 46 +++++++++++++++++++++++++++++++++++++----- pdfplumber/page.py | 3 ++- pdfplumber/table.py | 24 +++++++++------------- pdfplumber/utils.py | 6 ++++-- 7 files changed, 81 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4dfa469..a6f92c5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file. Currently g The format is based on [Keep a Changelog](http://keepachangelog.com/). +## [0.5.1] — 2017-02-26 +### Added +- Quick-draw `PageImage` methods: `.draw_vline`, `.draw_vlines`, `.draw_hline`, and `.draw_hlines`. +- Boolean parameter `keep_blank_chars` for `.extract_words(...)` and `TableFinder` settings. + +### Changed +- Increased default `text_tolerance` and `intersection_tolerance` TableFinder values from 1 to 3. + +### Fixed +- Properly handle conversion of PDFs with transparency to `pillow` images. +- Properly handle `pandas` DataFrames as inputs to multi-draw commands (e.g., `PageImage.draw_rects(...)`). + ## [0.5.0] - 2017-02-25 ### Added - Visual debugging features, via `Page.to_image(...)` and `PageImage`. (Introduces `wand` and `pillow` as package requirements.) 
diff --git a/README.md b/README.md index 038f19ab..743413fb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PDFPlumber `v0.5.0` +# PDFPlumber `v0.5.1` Plumb a PDF for detailed information about each text character, rectangle, and line. Plus: Table extraction and visual debugging. @@ -189,11 +189,13 @@ im = my_pdf.page[0].to_image(resolution=150) You can pass explicit coordinates or any `pdfplumber` PDF object (e.g., char, line, rect) to these methods. -| Single-object method | Bulk method | -|----------------------|-------------| -|`im.draw_line(line, stroke={color}, stroke_width=1)`| `im.draw_lines(list_of_lines, **kwargs)`| -|`im.draw_rect(bbox_or_obj, fill={color}, stroke={color}, stroke_width=1)`| `im.draw_rects(list_of_rects, **kwargs)`| -|`im.draw_circle(center_or_obj, radius=5, fill={color}, stroke={color})`| `im.draw_circles(list_of_circles, **kwargs)`| +| Single-object method | Bulk method | Description | +|----------------------|-------------|-------------| +|`im.draw_line(line, stroke={color}, stroke_width=1)`| `im.draw_lines(list_of_lines, **kwargs)`| Draws a line from a `line`-like object, or a 4-tuple bounding box.| +|`im.draw_vline(location, stroke={color}, stroke_width=1)`| `im.draw_vlines(list_of_locations, **kwargs)`| Draws a vertical line at the x-coordinate indicated by `location`.| +|`im.draw_hline(location, stroke={color}, stroke_width=1)`| `im.draw_hlines(list_of_locations, **kwargs)`| Draws a horizontal line at the y-coordinate indicated by `location`.| +|`im.draw_rect(bbox_or_obj, fill={color}, stroke={color}, stroke_width=1)`| `im.draw_rects(list_of_rects, **kwargs)`| Draws a rectangle from a `rect`, `char`, etc., or 4-tuple bounding box.| +|`im.draw_circle(center_or_obj, radius=5, fill={color}, stroke={color})`| `im.draw_circles(list_of_circles, **kwargs)`| Draws a circle at `(x, y)` coordinate or at the center of a `char`, `rect`, etc.| Note: The methods above are built on Pillow's [`ImageDraw` 
methods](http://pillow.readthedocs.io/en/latest/reference/ImageDraw.html), but the parameters have been tweaked for consistency with SVG's `fill`/`stroke`/`stroke_width` nomenclature. @@ -242,10 +244,11 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r "join_tolerance": 3, "edge_min_length": 3, "text_word_threshold": 3, - "text_tolerance": 1, + "keep_blank_chars": False, + "text_tolerance": 3, "text_x_tolerance": None, "text_y_tolerance": None, - "intersection_tolerance": 1, + "intersection_tolerance": 3, "intersection_x_tolerance": None, "intersection_y_tolerance": None, } @@ -261,6 +264,7 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r |`"join_tolerance"`| Line segments on the same infinite line, and whose ends are within `join_tolerance` of one another, will be "joined" into a single line segment.| |`"edge_min_length"`| Edges shorter than `edge_min_length` will be discarded before attempting to reconstruct the table.| |`"text_word_threshold"`| When using the `text` strategy, at least `text_word_threshold` words must share the same alignment.| +|`"keep_blank_chars"`| When using the `text` strategy, consider `" "` chars to be *parts* of words and not word-separators.| |`"text_tolerance"`, `"text_x_tolerance"`, `"text_y_tolerance"`| When the `text` strategy searches for words, it will expect the individual letters in each word to be no more than `text_tolerance` pixels apart.| |`"intersection_tolerance"`, `"intersection_x_tolerance"`, `"intersection_y_tolerance"`| When combining edges into cells, orthogonal edges most be within `intersection_tolerance` pixels to be considered intersecting.| diff --git a/pdfplumber/_version.py b/pdfplumber/_version.py index 9d9f4801..0ee93b94 100644 --- a/pdfplumber/_version.py +++ b/pdfplumber/_version.py @@ -1,2 +1,2 @@ -version_info = (0, 5, 0) +version_info = (0, 5, 1) __version__ = '.'.join(map(str, version_info)) diff --git a/pdfplumber/display.py 
b/pdfplumber/display.py index 47361bc0..c3ec7590 100644 --- a/pdfplumber/display.py +++ b/pdfplumber/display.py @@ -24,8 +24,12 @@ def get_page_image(pdf_path, page_no, resolution): page_path = "{0}[{1}]".format(pdf_path, page_no) with wand.image.Image(filename=page_path, resolution=resolution) as img: with img.convert("png") as png: - im = PIL.Image.open(BytesIO(png.make_blob())).convert("RGB") - return im + im = PIL.Image.open(BytesIO(png.make_blob())) + if "transparency" in im.info: + converted = im.convert("RGBA").convert("RGB") + else: + converted = im.convert("RGB") + return converted class PageImage(object): def __init__(self, page, original=None, resolution=DEFAULT_RESOLUTION): @@ -100,10 +104,42 @@ def draw_line(self, points_or_line, return self def draw_lines(self, list_of_lines, **kwargs): - for x in list_of_lines: + for x in utils.to_list(list_of_lines): self.draw_line(x, **kwargs) return self + def draw_vline(self, location, + stroke=DEFAULT_STROKE, + stroke_width=DEFAULT_STROKE_WIDTH): + points = (location, self.page.bbox[1], location, self.page.bbox[3]) + self.draw.line( + self._reproject_bbox(points), + fill=stroke, + width=stroke_width + ) + return self + + def draw_vlines(self, locations, **kwargs): + for x in utils.to_list(locations): + self.draw_vline(x, **kwargs) + return self + + def draw_hline(self, location, + stroke=DEFAULT_STROKE, + stroke_width=DEFAULT_STROKE_WIDTH): + points = (self.page.bbox[0], location, self.page.bbox[2], location) + self.draw.line( + self._reproject_bbox(points), + fill=stroke, + width=stroke_width + ) + return self + + def draw_hlines(self, locations, **kwargs): + for x in utils.to_list(locations): + self.draw_hline(x, **kwargs) + return self + def draw_rect(self, bbox_or_obj, fill=DEFAULT_FILL, stroke=DEFAULT_STROKE, @@ -142,7 +178,7 @@ def draw_rect(self, bbox_or_obj, return self def draw_rects(self, list_of_rects, **kwargs): - for x in list_of_rects: + for x in utils.to_list(list_of_rects): self.draw_rect(x, 
**kwargs) return self @@ -168,7 +204,7 @@ def draw_circle(self, center_or_obj, return self def draw_circles(self, list_of_circles, **kwargs): - for x in list_of_circles: + for x in utils.to_list(list_of_circles): self.draw_circle(x, **kwargs) return self diff --git a/pdfplumber/page.py b/pdfplumber/page.py index 00156eeb..c249f7ab 100644 --- a/pdfplumber/page.py +++ b/pdfplumber/page.py @@ -127,7 +127,8 @@ def extract_text(self, def extract_words(self, x_tolerance=utils.DEFAULT_X_TOLERANCE, - y_tolerance=utils.DEFAULT_Y_TOLERANCE): + y_tolerance=utils.DEFAULT_Y_TOLERANCE, + keep_blank_chars=False): return utils.extract_words(self.chars, x_tolerance=x_tolerance, diff --git a/pdfplumber/table.py b/pdfplumber/table.py index b9f622e3..d5210e7e 100644 --- a/pdfplumber/table.py +++ b/pdfplumber/table.py @@ -87,9 +87,7 @@ def get_group(edge): return edges def words_to_edges_h(words, - word_threshold=3, - join_tolerance=DEFAULT_JOIN_TOLERANCE, - snap_tolerance=DEFAULT_SNAP_TOLERANCE): + word_threshold=3): """ Find (imaginary) horizontal lines that connect the tops of at least `word_threshold` words. """ @@ -116,14 +114,10 @@ def words_to_edges_h(words, "orientation": "h" } for r in rects ] - return merge_edges(edges, - join_tolerance=join_tolerance, - snap_tolerance=snap_tolerance) + return edges def words_to_edges_v(words, - word_threshold=3, - join_tolerance=DEFAULT_JOIN_TOLERANCE, - snap_tolerance=DEFAULT_SNAP_TOLERANCE): + word_threshold=3): """ Find (imaginary) vertical lines that connect the left, right, or center of at least `word_threshold` words. 
""" @@ -185,9 +179,7 @@ def words_to_edges_v(words, "orientation": "v" } ] - return merge_edges(edges, - join_tolerance=join_tolerance, - snap_tolerance=snap_tolerance) + return edges def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1): """ @@ -404,10 +396,11 @@ def char_in_bbox(char, bbox): "join_tolerance": DEFAULT_JOIN_TOLERANCE, "edge_min_length": 3, "text_word_threshold": 3, - "text_tolerance": 1, + "keep_blank_chars": False, + "text_tolerance": 3, "text_x_tolerance": None, "text_y_tolerance": None, - "intersection_tolerance": 1, + "intersection_tolerance": 3, "intersection_x_tolerance": None, "intersection_y_tolerance": None, } @@ -479,7 +472,8 @@ def get_edges(self): yt = settings["text_tolerance"] words = self.page.extract_words( x_tolerance=xt, - y_tolerance=yt + y_tolerance=yt, + keep_blank_chars=settings["keep_blank_chars"] ) def v_edge_desc_to_edge(desc): diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py index 151b5b2d..91df7c3a 100644 --- a/pdfplumber/utils.py +++ b/pdfplumber/utils.py @@ -130,7 +130,9 @@ def bbox_to_rect(bbox): def extract_words(chars, x_tolerance=DEFAULT_X_TOLERANCE, - y_tolerance=DEFAULT_Y_TOLERANCE): + y_tolerance=DEFAULT_Y_TOLERANCE, + keep_blank_chars=False + ): x_tolerance = decimalize(x_tolerance) y_tolerance = decimalize(y_tolerance) @@ -153,7 +155,7 @@ def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE): current_word = [] for char in chars_sorted: - if get_text(char) == " ": + if not keep_blank_chars and get_text(char) == " ": if len(current_word) > 0: words.append(current_word) current_word = []