From 96d7a2af058030a33685e2a77548dbd5a6a7ea4f Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine <jsvine@gmail.com>
Date: Sun, 26 Feb 2017 10:44:28 -0500
Subject: [PATCH] v0.5.1; minor fixes/tweaks, adds line quick-draw

- Quick-draw `PageImage` methods: `.draw_vline`, `.draw_vlines`, `.draw_hline`, and `.draw_hlines`.
- Boolean parameter `keep_blank_chars` for `.extract_words(...)` and `TableFinder` settings.

- Increased default `text_tolerance` and `intersection_tolerance` TableFinder values from 1 to 3.

- Properly handle conversion of PDFs with transparency to `pillow` images.
- Properly handle `pandas` DataFrames as inputs to multi-draw commands (e.g., `PageImage.draw_rects(...)`).
---
 CHANGELOG.md           | 12 +++++++++++
 README.md              | 20 ++++++++++--------
 pdfplumber/_version.py |  2 +-
 pdfplumber/display.py  | 46 +++++++++++++++++++++++++++++++++++++-----
 pdfplumber/page.py     |  3 ++-
 pdfplumber/table.py    | 24 +++++++++-------------
 pdfplumber/utils.py    |  6 ++++--
 7 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a4dfa469..a6f92c5c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file. Currently g
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/).
 
+## [0.5.1] - 2017-02-26
+### Added
+- Quick-draw `PageImage` methods: `.draw_vline`, `.draw_vlines`, `.draw_hline`, and `.draw_hlines`.
+- Boolean parameter `keep_blank_chars` for `.extract_words(...)` and `TableFinder` settings.
+
+### Changed
+- Increased default `text_tolerance` and `intersection_tolerance` TableFinder values from 1 to 3.
+
+### Fixed
+- Properly handle conversion of PDFs with transparency to `pillow` images.
+- Properly handle `pandas` DataFrames as inputs to multi-draw commands (e.g., `PageImage.draw_rects(...)`).
+
 ## [0.5.0] - 2017-02-25
 ### Added
 - Visual debugging features, via `Page.to_image(...)` and `PageImage`. (Introduces `wand` and `pillow` as package requirements.)
diff --git a/README.md b/README.md
index 038f19ab..743413fb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# PDFPlumber `v0.5.0`
+# PDFPlumber `v0.5.1`
 
 Plumb a PDF for detailed information about each text character, rectangle, and line. Plus: Table extraction and visual debugging.
 
@@ -189,11 +189,13 @@ im = my_pdf.page[0].to_image(resolution=150)
 
 You can pass explicit coordinates or any `pdfplumber` PDF object (e.g., char, line, rect) to these methods.
 
-| Single-object method | Bulk method |
-|----------------------|-------------|
-|`im.draw_line(line, stroke={color}, stroke_width=1)`| `im.draw_lines(list_of_lines, **kwargs)`|
-|`im.draw_rect(bbox_or_obj, fill={color}, stroke={color}, stroke_width=1)`| `im.draw_rects(list_of_rects, **kwargs)`|
-|`im.draw_circle(center_or_obj, radius=5, fill={color}, stroke={color})`| `im.draw_circles(list_of_circles, **kwargs)`|
+| Single-object method | Bulk method | Description |
+|----------------------|-------------|-------------|
+|`im.draw_line(line, stroke={color}, stroke_width=1)`| `im.draw_lines(list_of_lines, **kwargs)`| Draws a line from a `line`-like object, or a 4-tuple bounding box.|
+|`im.draw_vline(location, stroke={color}, stroke_width=1)`| `im.draw_vlines(list_of_locations, **kwargs)`| Draws a vertical line at the x-coordinate indicated by `location`.|
+|`im.draw_hline(location, stroke={color}, stroke_width=1)`| `im.draw_hlines(list_of_locations, **kwargs)`| Draws a horizontal line at the y-coordinate indicated by `location`.|
+|`im.draw_rect(bbox_or_obj, fill={color}, stroke={color}, stroke_width=1)`| `im.draw_rects(list_of_rects, **kwargs)`| Draws a rectangle from a `rect`, `char`, etc., or 4-tuple bounding box.|
+|`im.draw_circle(center_or_obj, radius=5, fill={color}, stroke={color})`| `im.draw_circles(list_of_circles, **kwargs)`| Draws a circle at `(x, y)` coordinate or at the center of a `char`, `rect`, etc.|
 
 Note: The methods above are built on Pillow's [`ImageDraw` methods](http://pillow.readthedocs.io/en/latest/reference/ImageDraw.html), but the parameters have been tweaked for consistency with SVG's `fill`/`stroke`/`stroke_width` nomenclature.
 
@@ -242,10 +244,11 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
     "join_tolerance": 3,
     "edge_min_length": 3,
     "text_word_threshold": 3,
-    "text_tolerance": 1,
+    "keep_blank_chars": False,
+    "text_tolerance": 3,
     "text_x_tolerance": None,
     "text_y_tolerance": None,
-    "intersection_tolerance": 1,
+    "intersection_tolerance": 3,
     "intersection_x_tolerance": None,
     "intersection_y_tolerance": None,
 }
@@ -261,6 +264,7 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
 |`"join_tolerance"`| Line segments on the same infinite line, and whose ends are within `join_tolerance` of one another, will be "joined" into a single line segment.|
 |`"edge_min_length"`| Edges shorter than `edge_min_length` will be discarded before attempting to reconstruct the table.|
 |`"text_word_threshold"`| When using the `text` strategy, at least `text_word_threshold` words must share the same alignment.|
+|`"keep_blank_chars"`| When using the `text` strategy, consider `" "` chars to be *parts* of words and not word-separators.|
 |`"text_tolerance"`, `"text_x_tolerance"`, `"text_y_tolerance"`| When the `text` strategy searches for words, it will expect the individual letters in each word to be no more than `text_tolerance` pixels apart.|
 |`"intersection_tolerance"`, `"intersection_x_tolerance"`, `"intersection_y_tolerance"`| When combining edges into cells, orthogonal edges most be within `intersection_tolerance` pixels to be considered intersecting.|
 
diff --git a/pdfplumber/_version.py b/pdfplumber/_version.py
index 9d9f4801..0ee93b94 100644
--- a/pdfplumber/_version.py
+++ b/pdfplumber/_version.py
@@ -1,2 +1,2 @@
-version_info = (0, 5, 0)
+version_info = (0, 5, 1)
 __version__ = '.'.join(map(str, version_info))
diff --git a/pdfplumber/display.py b/pdfplumber/display.py
index 47361bc0..c3ec7590 100644
--- a/pdfplumber/display.py
+++ b/pdfplumber/display.py
@@ -24,8 +24,12 @@ def get_page_image(pdf_path, page_no, resolution):
     page_path = "{0}[{1}]".format(pdf_path, page_no)
     with wand.image.Image(filename=page_path, resolution=resolution) as img:
         with img.convert("png") as png:
-            im = PIL.Image.open(BytesIO(png.make_blob())).convert("RGB")
-            return im
+            im = PIL.Image.open(BytesIO(png.make_blob()))
+            if "transparency" in im.info:
+                converted = im.convert("RGBA").convert("RGB")
+            else:
+                converted = im.convert("RGB")
+            return converted
 
 class PageImage(object):
     def __init__(self, page, original=None, resolution=DEFAULT_RESOLUTION):
@@ -100,10 +104,42 @@ def draw_line(self, points_or_line,
         return self
 
     def draw_lines(self, list_of_lines, **kwargs):
-        for x in list_of_lines:
+        for x in utils.to_list(list_of_lines):
             self.draw_line(x, **kwargs)
         return self
         
+    def draw_vline(self, location,
+        stroke=DEFAULT_STROKE,
+        stroke_width=DEFAULT_STROKE_WIDTH):
+        points = (location, self.page.bbox[1], location, self.page.bbox[3])
+        self.draw.line(
+            self._reproject_bbox(points),
+            fill=stroke,
+            width=stroke_width
+        )
+        return self
+
+    def draw_vlines(self, locations, **kwargs):
+        for x in utils.to_list(locations):
+            self.draw_vline(x, **kwargs)
+        return self
+        
+    def draw_hline(self, location,
+        stroke=DEFAULT_STROKE,
+        stroke_width=DEFAULT_STROKE_WIDTH):
+        points = (self.page.bbox[0], location, self.page.bbox[2], location)
+        self.draw.line(
+            self._reproject_bbox(points),
+            fill=stroke,
+            width=stroke_width
+        )
+        return self
+
+    def draw_hlines(self, locations, **kwargs):
+        for x in utils.to_list(locations):
+            self.draw_hline(x, **kwargs)
+        return self
+        
     def draw_rect(self, bbox_or_obj,
         fill=DEFAULT_FILL,
         stroke=DEFAULT_STROKE,
@@ -142,7 +178,7 @@ def draw_rect(self, bbox_or_obj,
         return self
 
     def draw_rects(self, list_of_rects, **kwargs):
-        for x in list_of_rects:
+        for x in utils.to_list(list_of_rects):
             self.draw_rect(x, **kwargs)
         return self
 
@@ -168,7 +204,7 @@ def draw_circle(self, center_or_obj,
         return self
 
     def draw_circles(self, list_of_circles, **kwargs):
-        for x in list_of_circles:
+        for x in utils.to_list(list_of_circles):
             self.draw_circle(x, **kwargs)
         return self
 
diff --git a/pdfplumber/page.py b/pdfplumber/page.py
index 00156eeb..c249f7ab 100644
--- a/pdfplumber/page.py
+++ b/pdfplumber/page.py
@@ -127,7 +127,8 @@ def extract_text(self,
 
     def extract_words(self,
         x_tolerance=utils.DEFAULT_X_TOLERANCE,
-        y_tolerance=utils.DEFAULT_Y_TOLERANCE):
+        y_tolerance=utils.DEFAULT_Y_TOLERANCE,
+        keep_blank_chars=False):
 
         return utils.extract_words(self.chars,
             x_tolerance=x_tolerance,
diff --git a/pdfplumber/table.py b/pdfplumber/table.py
index b9f622e3..d5210e7e 100644
--- a/pdfplumber/table.py
+++ b/pdfplumber/table.py
@@ -87,9 +87,7 @@ def get_group(edge):
     return edges
 
 def words_to_edges_h(words,
-    word_threshold=3,
-    join_tolerance=DEFAULT_JOIN_TOLERANCE,
-    snap_tolerance=DEFAULT_SNAP_TOLERANCE):
+    word_threshold=3):
     """
     Find (imaginary) horizontal lines that connect the tops of at least `word_threshold` words.
     """
@@ -116,14 +114,10 @@ def words_to_edges_h(words,
         "orientation": "h"
     } for r in rects ]
 
-    return merge_edges(edges,
-        join_tolerance=join_tolerance,
-        snap_tolerance=snap_tolerance)
+    return edges
 
 def words_to_edges_v(words,
-    word_threshold=3,
-    join_tolerance=DEFAULT_JOIN_TOLERANCE,
-    snap_tolerance=DEFAULT_SNAP_TOLERANCE):
+    word_threshold=3):
     """
     Find (imaginary) vertical lines that connect the left, right, or center of at least `word_threshold` words.
     """
@@ -185,9 +179,7 @@ def words_to_edges_v(words,
         "orientation": "v"
     } ]
     
-    return merge_edges(edges,
-        join_tolerance=join_tolerance,
-        snap_tolerance=snap_tolerance)
+    return edges
 
 def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1):
     """
@@ -404,10 +396,11 @@ def char_in_bbox(char, bbox):
     "join_tolerance": DEFAULT_JOIN_TOLERANCE,
     "edge_min_length": 3,
     "text_word_threshold": 3,
-    "text_tolerance": 1,
+    "keep_blank_chars": False,
+    "text_tolerance": 3,
     "text_x_tolerance": None,
     "text_y_tolerance": None,
-    "intersection_tolerance": 1,
+    "intersection_tolerance": 3,
     "intersection_x_tolerance": None,
     "intersection_y_tolerance": None,
 }
@@ -479,7 +472,8 @@ def get_edges(self):
                 yt = settings["text_tolerance"]
             words = self.page.extract_words(
                 x_tolerance=xt,
-                y_tolerance=yt
+                y_tolerance=yt,
+                keep_blank_chars=settings["keep_blank_chars"]
             )
 
         def v_edge_desc_to_edge(desc):
diff --git a/pdfplumber/utils.py b/pdfplumber/utils.py
index 151b5b2d..91df7c3a 100644
--- a/pdfplumber/utils.py
+++ b/pdfplumber/utils.py
@@ -130,7 +130,9 @@ def bbox_to_rect(bbox):
 
 def extract_words(chars,
     x_tolerance=DEFAULT_X_TOLERANCE,
-    y_tolerance=DEFAULT_Y_TOLERANCE):
+    y_tolerance=DEFAULT_Y_TOLERANCE,
+    keep_blank_chars=False
+    ):
 
     x_tolerance = decimalize(x_tolerance)
     y_tolerance = decimalize(y_tolerance)
@@ -153,7 +155,7 @@ def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
         current_word = []
 
         for char in chars_sorted:
-            if get_text(char) == " ":
+            if not keep_blank_chars and get_text(char) == " ":
                 if len(current_word) > 0:
                     words.append(current_word)
                     current_word = []