From 0f229f05561b87e57106af18055de2efa81a9581 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?=
 <alvarojusten@gmail.com>
Date: Fri, 12 Apr 2019 01:20:35 -0300
Subject: [PATCH 1/4] Add backend args to pdf_table_lines

---
 rows/plugins/plugin_pdf.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rows/plugins/plugin_pdf.py b/rows/plugins/plugin_pdf.py
index 80fa8ebc..d8561841 100644
--- a/rows/plugins/plugin_pdf.py
+++ b/rows/plugins/plugin_pdf.py
@@ -714,13 +714,17 @@ def pdf_table_lines(
     x_threshold=0.5,
     y_threshold=0.5,
     backend=None,
+    backend_args=None,
+    backend_kwargs=None,
 ):
     backend = backend or default_backend()
 
     # TODO: check if both backends accepts filename or fobj
     Backend = get_backend(backend)
     Algorithm = get_algorithm(algorithm)
-    pdf_doc = Backend(filename_or_fobj)
+    backend_args = backend_args or []
+    backend_kwargs = backend_kwargs or {}
+    pdf_doc = Backend(filename_or_fobj, *backend_args, **backend_kwargs)
 
     pages = pdf_doc.objects(
         page_numbers=page_numbers, starts_after=starts_after, ends_before=ends_before

From c0cdd2e43e80e2146c92cd142d94fd6d2ea27849 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?=
 <alvarojusten@gmail.com>
Date: Fri, 12 Apr 2019 01:24:35 -0300
Subject: [PATCH 2/4] Add draft OCR implementation using pytesseract

---
 rows/__init__.py           |   3 ++
 rows/plugins/__init__.py   |   5 ++
 rows/plugins/plugin_ocr.py | 100 +++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+)
 create mode 100644 rows/plugins/plugin_ocr.py

diff --git a/rows/__init__.py b/rows/__init__.py
index 90211297..efc0505a 100644
--- a/rows/__init__.py
+++ b/rows/__init__.py
@@ -74,5 +74,8 @@
 if plugins.pdf:
     import_from_pdf = plugins.pdf.import_from_pdf
 
+if plugins.ocr:
+    import_from_image = plugins.ocr.import_from_image
+
 
 __version__ = "0.4.2dev0"
diff --git a/rows/plugins/__init__.py b/rows/plugins/__init__.py
index 9bb2933f..838edf97 100644
--- a/rows/plugins/__init__.py
+++ b/rows/plugins/__init__.py
@@ -64,3 +64,8 @@
     from . import plugin_pdf as pdf
 except ImportError:
     pdf = None
+
+try:
+    from . import plugin_ocr as ocr
+except ImportError:
+    ocr = None
diff --git a/rows/plugins/plugin_ocr.py b/rows/plugins/plugin_ocr.py
new file mode 100644
index 00000000..89501427
--- /dev/null
+++ b/rows/plugins/plugin_ocr.py
@@ -0,0 +1,100 @@
+# coding: utf-8
+
+# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>
+
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU Lesser General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU Lesser General Public License for more details.
+
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import unicode_literals
+
+from cached_property import cached_property
+from pytesseract import image_to_boxes
+from PIL import Image
+
+from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
+from rows.plugins.utils import create_table
+
+
+class TesseractBackend(PDFBackend):
+    """Backend that yields Tesseract character boxes as text objects."""
+    name = "tesseract"
+
+    def __init__(self, filename_or_fobj, language):
+        self.filename_or_fobj = filename_or_fobj
+        self.language = language
+        super().__init__(self.filename_or_fobj)
+
+    @cached_property
+    def document(self):
+        if hasattr(self.filename_or_fobj, "read"):
+            image = Image.open(self.filename_or_fobj)
+        else:
+            image = self.filename_or_fobj  # not file-like: passed through unchanged
+
+        return image
+
+    @cached_property
+    def number_of_pages(self):
+        return 1  # TODO: fix (hard-coded to a single page)
+
+    def extract_text(self, page_numbers=None):
+        return ""  # TODO: image_to_string (always returns an empty string for now)
+
+    def objects(self, page_numbers=None, starts_after=None, ends_before=None):
+        header = "char left bottom right top page".split()
+        boxes = image_to_boxes(self.document, lang=self.language).splitlines()
+        text_objs = []
+        for box in boxes:
+            row = {}
+            for key, value in zip(header, box.split()):
+                if key != "char":
+                    value = int(value)  # every field except the character is numeric
+                row[key] = value
+            obj = TextObject(
+                x0=row["left"],
+                y0=row["bottom"],
+                x1=row["right"],
+                y1=row["top"],
+                text=row["char"],
+            )
+            text_objs.append(obj)
+
+        text_objs.sort(key=lambda obj: (obj.y0, obj.x0))
+        # TODO: group contiguous objects before yielding
+        yield text_objs
+
+    text_objects = objects
+
+
+def import_from_image(
+    filename_or_fobj,
+    language="eng",
+    algorithm="y-groups",
+    x_threshold=1.0,
+    y_threshold=1.0,
+    *args,
+    **kwargs
+):
+    """Extract table lines from an image via TesseractBackend and build a table."""
+    lines = pdf_table_lines(
+        filename_or_fobj,
+        None,
+        starts_after=None,
+        ends_before=None,
+        algorithm=algorithm,
+        x_threshold=x_threshold,
+        y_threshold=y_threshold,
+        backend=TesseractBackend,
+        backend_kwargs={"language": language},
+    )
+    return create_table(lines, meta={"imported_from": "image"}, *args, **kwargs)

From 0348e8fd92adbcdb47ab63668c8bfb6901944dd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?=
 <alvarojusten@gmail.com>
Date: Fri, 12 Apr 2019 01:24:57 -0300
Subject: [PATCH 3/4] Add OCR tests draft

---
 tests/tests_plugin_ocr.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 tests/tests_plugin_ocr.py

diff --git a/tests/tests_plugin_ocr.py b/tests/tests_plugin_ocr.py
new file mode 100644
index 00000000..56ea0cd2
--- /dev/null
+++ b/tests/tests_plugin_ocr.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+
+# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>
+
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU Lesser General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU Lesser General Public License for more details.
+
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import unicode_literals
+
+import unittest
+
+import rows
+
+import tests.utils as utils
+
+
+class PluginOcrTestCase(utils.RowsTestMixIn, unittest.TestCase):
+
+    plugin_name = "ocr"
+    file_extension = "png"
+    filename = "tests/data/all-field-types.png"
+
+    def test_imports(self):
+        self.assertIs(rows.import_from_image, rows.plugins.ocr.import_from_image)
+    # NOTE(review): no "test" prefix, so unittest will not collect basic_test.
+    def basic_test(self):
+        table = rows.import_from_image(self.filename)
+        # TODO: assert

From 3816f159b74d797ccc17a9f06c2bd580e92740da Mon Sep 17 00:00:00 2001
From: Joao S O Bueno <gwidion@gmail.com>
Date: Tue, 16 Apr 2019 03:23:35 -0300
Subject: [PATCH 4/4] Adds code to merge contiguous rectangular areas

---
 rows/plugins/plugin_ocr.py |  6 ++-
 rows/plugins/utils_rect.py | 98 ++++++++++++++++++++++++++++++++++++++
 tests/tests_plugin_ocr.py  | 24 ++++++++++
 3 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 rows/plugins/utils_rect.py

diff --git a/rows/plugins/plugin_ocr.py b/rows/plugins/plugin_ocr.py
index 89501427..cd1357f5 100644
--- a/rows/plugins/plugin_ocr.py
+++ b/rows/plugins/plugin_ocr.py
@@ -23,6 +23,7 @@
 
 from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
 from rows.plugins.utils import create_table
+from rows.plugins.utils_rect import join_contiguous_rects
 
 
 class TesseractBackend(PDFBackend):
@@ -54,6 +55,7 @@ def objects(self, page_numbers=None, starts_after=None, ends_before=None):
         header = "char left bottom right top page".split()
         boxes = image_to_boxes(self.document, lang=self.language).splitlines()
         text_objs = []
+        max_width = 0
         for box in boxes:
             row = {}
             for key, value in zip(header, box.split()):
@@ -68,9 +70,11 @@ def objects(self, page_numbers=None, starts_after=None, ends_before=None):
                 text=row["char"],
             )
             text_objs.append(obj)
+            max_width = max(max_width, row["right"] - row["left"])
 
         text_objs.sort(key=lambda obj: (obj.y0, obj.x0))
-        # TODO: group contiguous objects before yielding
+        # Group contiguous objects before yielding
+        text_objs = join_contiguous_rects(text_objs, tolerance=max_width)
         yield text_objs
 
     text_objects = objects
diff --git a/rows/plugins/utils_rect.py b/rows/plugins/utils_rect.py
new file mode 100644
index 00000000..3a14a13f
--- /dev/null
+++ b/rows/plugins/utils_rect.py
@@ -0,0 +1,98 @@
+from copy import copy
+
+
+SIDES = "left top right bottom".split()
+
+
+class Rect:
+    """Rectangle exposing its fields both as attributes and as items."""
+    def __init__(self, rect):
+        self.__dict__.update(rect)  # keeps every key, not only SIDES
+
+    def __hash__(self):
+        return hash((self.left, self.top, self.right, self.bottom))
+
+    def __getitem__(self, item):
+        return vars(self)[item]
+
+    def __setitem__(self, item, value):
+        vars(self)[item] = value
+
+    def __eq__(self, other):
+        return not any(self[side] != other[side] for side in SIDES)
+
+    def __repr__(self):
+        return "<{left}, {top}, {right}, {bottom}>".format(**vars(self))
+
+
+def consolidate(new_rect, rect1, rect2):
+    """Widen `new_rect` (a copy of `rect1` when None) to span both rects."""
+    merged = copy(rect1) if new_rect is None else new_rect
+    for side, pick in zip(SIDES, (min, max, max, min)):
+        merged[side] = pick(r[side] for r in (merged, rect1, rect2))
+
+    return merged
+
+
+def mag(x, y):
+    return pow(x, 2) + pow(y, 2)  # squared length of (x, y); no square root
+
+
+def find_paired_rects(rects, tolerance):
+    """Map each rect to its closest right-hand neighbour within `tolerance`."""
+    rects_by_left = {}
+    for r in rects:
+        rects_by_left.setdefault(r.left, []).append(r)
+
+    paired = {}
+    for rect in rects:
+        mag_alignment = None
+        for offset_x in range(-tolerance, tolerance + 1):
+            if (rect.right + offset_x) not in rects_by_left:
+                continue
+            for aligned_rect in rects_by_left[rect.right + offset_x]:
+                if aligned_rect is rect:
+                    continue
+                for offset_y in range(-tolerance, tolerance + 1):
+                    if (rect.top + offset_y) == aligned_rect.top:
+                        new_mag = mag(offset_x, offset_y)
+                        if mag_alignment is None or new_mag < mag_alignment:
+                            paired[rect] = aligned_rect
+                            mag_alignment = new_mag
+
+    return paired
+
+
+def join_contiguous_rects(rect_dicts, tolerance=1):
+    """Merge chains of paired rects; return dicts sorted by (-top, left)."""
+    rects = [Rect(rect) for rect in rect_dicts]
+    paired = find_paired_rects(rects, tolerance)
+
+    consolidated = []
+    to_remove = set()
+
+    for rect in sorted(rects, key=lambda r: r.left):
+        if rect in to_remove:
+            continue
+        new_rect = None
+        chars = ""
+        chain = {rect}
+        # Stop when the chain loops back on itself: with a tolerance wider
+        # than a rect, two rects can pair with each other and never end.
+        while rect in paired and paired[rect] not in chain:
+            chars += rect.char
+            new_rect = consolidate(new_rect, rect, paired[rect])
+            to_remove.add(rect)
+            rect = paired[rect]
+            chain.add(rect)
+
+        chars += rect.char
+        if new_rect:
+            new_rect.char = chars
+            to_remove.add(rect)
+            consolidated.append(new_rect)
+
+    remaining = [rect for rect in rects if rect not in to_remove]
+    merged = sorted(consolidated + remaining, key=lambda r: (-r.top, r.left))
+    return [r.__dict__ for r in merged]
+
diff --git a/tests/tests_plugin_ocr.py b/tests/tests_plugin_ocr.py
index 56ea0cd2..94e84f98 100644
--- a/tests/tests_plugin_ocr.py
+++ b/tests/tests_plugin_ocr.py
@@ -20,9 +20,24 @@
 import unittest
 
 import rows
+from rows.plugins.utils_rect import join_contiguous_rects
 
 import tests.utils as utils
 
+test_data = [
+    {'char': 'R', 'left': 1282.0, 'bottom': 52.0, 'right': 1284.0, 'top': 63.0, 'page': 0.0},
+    {'char': 'S', 'left': 1284.0, 'bottom': 52.0, 'right': 1295.0, 'top': 63.0, 'page': 0.0},
+    {'char': '2', 'left': 1302.0, 'bottom': 52.0, 'right': 1303.0, 'top': 63.0, 'page': 0.0},
+    {'char': '5', 'left': 1303.0, 'bottom': 52.0, 'right': 1309.0, 'top': 63.0, 'page': 0.0},
+    {'char': '.', 'left': 1312.0, 'bottom': 53.0, 'right': 1317.0, 'top': 63.0, 'page': 0.0},
+    {'char': '0', 'left': 1319.0, 'bottom': 53.0, 'right': 1321.0, 'top': 56.0, 'page': 0.0},
+    {'char': '0', 'left': 1326.0, 'bottom': 53.0, 'right': 1334.0, 'top': 64.0, 'page': 0.0},
+    {'char': '0', 'left': 1334.0, 'bottom': 53.0, 'right': 1338.0, 'top': 64.0, 'page': 0.0},
+    {'char': ',', 'left': 1338.0, 'bottom': 53.0, 'right': 1343.0, 'top': 64.0, 'page': 0.0},
+    {'char': '0', 'left': 1344.0, 'bottom': 51.0, 'right': 1347.0, 'top': 56.0, 'page': 0.0},
+    {'char': '0', 'left': 1352.0, 'bottom': 53.0, 'right': 1362.0, 'top': 64.0, 'page': 0.0},
+]
+
 
 class PluginOcrTestCase(utils.RowsTestMixIn, unittest.TestCase):
 
@@ -36,3 +51,12 @@ def test_imports(self):
     def basic_test(self):
         table = rows.import_from_image(self.filename)
         # TODO: assert
+
+
+class TestRectUtils(unittest.TestCase):
+    """Checks for the rectangle helpers in rows.plugins.utils_rect."""
+    def test_join_contiguous_rects(self):
+        self.assertEqual(
+            join_contiguous_rects(test_data, 10),
+            [{'char': 'RS25.000,00', 'left': 1282.0, 'bottom': 51.0, 'right': 1362.0, 'top': 64.0, 'page': 0.0}]
+        )