turicas · jsbueno · Apr 12, 2019 · Apr 12, 2019 · Apr 12, 2019 · Apr 16, 2019
diff --git a/rows/__init__.py b/rows/__init__.py
@@ -74,5 +74,8 @@
 if plugins.pdf:
     import_from_pdf = plugins.pdf.import_from_pdf
 
+if plugins.ocr:  # None when rows.plugins could not import plugin_ocr
+    import_from_image = plugins.ocr.import_from_image
+
 
 __version__ = "0.4.2dev0"
diff --git a/rows/plugins/__init__.py b/rows/plugins/__init__.py
@@ -64,3 +64,8 @@
     from . import plugin_pdf as pdf
 except ImportError:
     pdf = None
+
+try:
+    from . import plugin_ocr as ocr
+except ImportError:  # the plugin's own imports (e.g. pytesseract, PIL) failed
+    ocr = None
diff --git a/rows/plugins/plugin_ocr.py b/rows/plugins/plugin_ocr.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+
+# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>
+
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU Lesser General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU Lesser General Public License for more details.
+
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import unicode_literals
+
+from cached_property import cached_property
+from pytesseract import image_to_boxes
+from PIL import Image
+
+from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
+from rows.plugins.utils import create_table
+from rows.plugins.utils_rect import join_contiguous_rects
+
+
+class TesseractBackend(PDFBackend):
+    """PDFBackend whose text objects come from running Tesseract on an image."""
+
+    name = "tesseract"
+
+    def __init__(self, filename_or_fobj, language):
+        """Keep the image source and the `lang` value given to pytesseract."""
+        self.filename_or_fobj = filename_or_fobj
+        self.language = language
+        # Two-argument form: bare `super()` raises TypeError on Python 2, which
+        # the `__future__` import at the top of this module suggests is supported.
+        super(TesseractBackend, self).__init__(self.filename_or_fobj)
+
+    @cached_property
+    def document(self):
+        """PIL image for file-like input; any other input is used unchanged."""
+        if hasattr(self.filename_or_fobj, "read"):
+            return Image.open(self.filename_or_fobj)
+        return self.filename_or_fobj
+
+    @cached_property
+    def number_of_pages(self):
+        """Hard-coded to a single page for now."""
+        return 1  # TODO: fix
+
+    def extract_text(self, page_numbers=None):
+        return ""  # TODO: image_to_string
+
+    def objects(self, page_numbers=None, starts_after=None, ends_before=None):
+        """Yield one list of TextObject built from the grouped OCR boxes.
+
+        The three parameters are accepted but not used.
+        """
+        header = "char left bottom right top page".split()
+        boxes = image_to_boxes(self.document, lang=self.language).splitlines()
+        chars = []
+        for box in boxes:
+            fields = zip(header, box.split())
+            chars.append({k: v if k == "char" else int(v) for k, v in fields})
+        # The widest box is used as the gap tolerated between neighbouring boxes.
+        max_width = max([c["right"] - c["left"] for c in chars] or [0])
+        chars.sort(key=lambda c: (c["bottom"], c["left"]))
+        # join_contiguous_rects reads and returns mappings keyed by `header`,
+        # so group the raw rows first and build TextObjects from the result.
+        yield [
+            TextObject(
+                x0=c["left"], y0=c["bottom"], x1=c["right"], y1=c["top"], text=c["char"]
+            )
+            for c in join_contiguous_rects(chars, tolerance=max_width)
+        ]
+
+    text_objects = objects
+
+
+def import_from_image(
+    filename_or_fobj, language="eng", algorithm="y-groups",
+    x_threshold=1.0, y_threshold=1.0, *args, **kwargs
+):
+    """Import a table from an image, with TesseractBackend as the backend.
+
+    `language` is handed to TesseractBackend; `algorithm`, `x_threshold` and
+    `y_threshold` go to pdf_table_lines; remaining positional and keyword
+    arguments are forwarded to create_table. Returns create_table's result.
+    """
+    lines = pdf_table_lines(
+        filename_or_fobj,
+        None,
+        starts_after=None,
+        ends_before=None,
+        algorithm=algorithm,
+        x_threshold=x_threshold,
+        y_threshold=y_threshold,
+        backend=TesseractBackend,
+        backend_kwargs={"language": language},
+    )
+    return create_table(lines, meta={"imported_from": "image"}, *args, **kwargs)
diff --git a/rows/plugins/plugin_pdf.py b/rows/plugins/plugin_pdf.py
@@ -714,13 +714,17 @@ def pdf_table_lines(
     x_threshold=0.5,
     y_threshold=0.5,
     backend=None,
+    backend_args=None,
+    backend_kwargs=None,
 ):
     backend = backend or default_backend()
 
     # TODO: check if both backends accepts filename or fobj
     Backend = get_backend(backend)
     Algorithm = get_algorithm(algorithm)
-    pdf_doc = Backend(filename_or_fobj)
+    backend_args = backend_args or []
+    backend_kwargs = backend_kwargs or {}
+    pdf_doc = Backend(filename_or_fobj, *backend_args, **backend_kwargs)
 
     pages = pdf_doc.objects(
         page_numbers=page_numbers, starts_after=starts_after, ends_before=ends_before

diff --git a/rows/plugins/utils_rect.py b/rows/plugins/utils_rect.py
@@ -0,0 +1,98 @@
+from copy import copy
+
+
+SIDES = "left top right bottom".split()  # order is relied on by consolidate()
+
+
+class Rect:
+    """Record built from a mapping; entries are both attributes and items."""
+    def __init__(self, rect):
+        vars(self).update(rect)
+
+    def __hash__(self):
+        return hash(tuple(getattr(self, side) for side in SIDES))
+
+    def __getitem__(self, item):
+        return vars(self)[item]
+
+    def __setitem__(self, item, value):
+        vars(self)[item] = value
+
+    def __eq__(self, other):  # only the four sides take part in equality
+        return all(self[side] == other[side] for side in SIDES)
+
+    def __repr__(self):
+        return "<{left}, {top}, {right}, {bottom}>".format(**vars(self))
+
+
+def consolidate(new_rect, rect1, rect2):
+    """Expand new_rect (a copy of rect1 when None) to span rect1 and rect2."""
+    merged = copy(rect1) if new_rect is None else new_rect
+    # Same order as SIDES: left and bottom shrink, top and right grow.
+    for side, pick in zip(SIDES, (min, max, max, min)):
+        merged[side] = pick(merged[side], rect1[side], rect2[side])
+    return merged
+
+
+def mag(x, y):
+    return sum(value ** 2 for value in (x, y))  # squared length, no sqrt
+
+
+def find_paired_rects(rects, tolerance):
+    """Map each rect to the rect whose left edge best meets its right edge.
+
+    Candidates have their left edge within `tolerance` (an int) of the
+    rect's right edge and a top within `tolerance` of its top; the smallest
+    squared offset wins, first found on ties. Unpaired rects are left out."""
+    rects_by_left = {}
+    for r in rects:
+        rects_by_left.setdefault(r.left, []).append(r)
+
+    paired = {}
+    for rect in rects:
+        mag_alignment = None
+        for offset_x in range(-tolerance, tolerance + 1):
+            for aligned_rect in rects_by_left.get(rect.right + offset_x, ()):
+                offset_y = aligned_rect.top - rect.top
+                if aligned_rect is rect or abs(offset_y) > tolerance:
+                    continue
+                new_mag = mag(offset_x, offset_y)
+                if mag_alignment is None or new_mag < mag_alignment:
+                    paired[rect] = aligned_rect
+                    mag_alignment = new_mag
+    return paired
+
+
+def join_contiguous_rects(rect_dicts, tolerance=1):
+    """Merge character boxes that continue one another into single boxes.
+
+    rect_dicts: mappings with left/top/right/bottom/char entries; a merged
+    box keeps any other entries of the first box in its run.
+    tolerance: int, largest offset between neighbours (see find_paired_rects).
+    Returns a list of dicts sorted by descending top, then ascending left.
+    """
+    rects = [Rect(rect) for rect in rect_dicts]
+    paired = find_paired_rects(rects, tolerance)
+    consolidated = []
+    to_remove = set()
+
+    for rect in sorted(rects, key=lambda r: r.left):
+        if rect in to_remove:
+            continue
+        new_rect = None
+        chars = []
+        # Never walk into a rect that was already consumed: doing so repeats
+        # its text and, when two rects pair with each other, never terminates.
+        while rect in paired and paired[rect] not in to_remove:
+            chars.append(rect.char)
+            new_rect = consolidate(new_rect, rect, paired[rect])
+            to_remove.add(rect)
+            rect = paired[rect]
+        if new_rect is not None:
+            new_rect.char = "".join(chars) + rect.char
+            to_remove.add(rect)
+            consolidated.append(new_rect)
+
+    remaining = [rect for rect in rects if rect not in to_remove]
+    ordered = sorted(consolidated + remaining, key=lambda r: (-r.top, r.left))
+    return [rect.__dict__ for rect in ordered]
+
diff --git a/tests/tests_plugin_ocr.py b/tests/tests_plugin_ocr.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+
+# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>
+
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU Lesser General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU Lesser General Public License for more details.
+
+#    You should have received a copy of the GNU Lesser General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import unicode_literals
+
+import unittest
+
+import rows
+from rows.plugins.utils_rect import join_contiguous_rects
+
+import tests.utils as utils
+
+test_data = [  # per-character boxes that TestRectUtils expects to join into one
+    {'char': 'R', 'left': 1282.0, 'bottom': 52.0, 'right': 1284.0, 'top': 63.0, 'page': 0.0},
+    {'char': 'S', 'left': 1284.0, 'bottom': 52.0, 'right': 1295.0, 'top': 63.0, 'page': 0.0},
+    {'char': '2', 'left': 1302.0, 'bottom': 52.0, 'right': 1303.0, 'top': 63.0, 'page': 0.0},
+    {'char': '5', 'left': 1303.0, 'bottom': 52.0, 'right': 1309.0, 'top': 63.0, 'page': 0.0},
+    {'char': '.', 'left': 1312.0, 'bottom': 53.0, 'right': 1317.0, 'top': 63.0, 'page': 0.0},
+    {'char': '0', 'left': 1319.0, 'bottom': 53.0, 'right': 1321.0, 'top': 56.0, 'page': 0.0},
+    {'char': '0', 'left': 1326.0, 'bottom': 53.0, 'right': 1334.0, 'top': 64.0, 'page': 0.0},
+    {'char': '0', 'left': 1334.0, 'bottom': 53.0, 'right': 1338.0, 'top': 64.0, 'page': 0.0},
+    {'char': ',', 'left': 1338.0, 'bottom': 53.0, 'right': 1343.0, 'top': 64.0, 'page': 0.0},
+    {'char': '0', 'left': 1344.0, 'bottom': 51.0, 'right': 1347.0, 'top': 56.0, 'page': 0.0},
+    {'char': '0', 'left': 1352.0, 'bottom': 53.0, 'right': 1362.0, 'top': 64.0, 'page': 0.0},
+]
+
+
+class PluginOcrTestCase(utils.RowsTestMixIn, unittest.TestCase):
+
+    plugin_name = "ocr"
+    file_extension = "png"
+    filename = "tests/data/all-field-types.png"
+
+    def test_imports(self):  # package-level name must be the plugin's function
+        self.assertIs(rows.import_from_image, rows.plugins.ocr.import_from_image)
+
+    def basic_test(self):  # NOTE(review): no "test" prefix; is it ever collected?
+        table = rows.import_from_image(self.filename)
+        # TODO: assert
+
+
+class TestRectUtils(unittest.TestCase):
+    def test_join_contiguous_rects(self):
+        # assertEqual: the assertEquals alias is deprecated and gone in 3.12.
+        self.assertEqual(
+            join_contiguous_rects(test_data, 10),
+            [{'char': 'RS25.000,00', 'left': 1282.0, 'bottom': 51.0, 'right': 1362.0, 'top': 64.0, 'page': 0.0}]
+        )