From 0f229f05561b87e57106af18055de2efa81a9581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?= Date: Fri, 12 Apr 2019 01:20:35 -0300 Subject: [PATCH 1/4] Add backend args to pdf_table_lines --- rows/plugins/plugin_pdf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rows/plugins/plugin_pdf.py b/rows/plugins/plugin_pdf.py index 80fa8ebc..d8561841 100644 --- a/rows/plugins/plugin_pdf.py +++ b/rows/plugins/plugin_pdf.py @@ -714,13 +714,17 @@ def pdf_table_lines( x_threshold=0.5, y_threshold=0.5, backend=None, + backend_args=None, + backend_kwargs=None, ): backend = backend or default_backend() # TODO: check if both backends accepts filename or fobj Backend = get_backend(backend) Algorithm = get_algorithm(algorithm) - pdf_doc = Backend(filename_or_fobj) + backend_args = backend_args or [] + backend_kwargs = backend_kwargs or {} + pdf_doc = Backend(filename_or_fobj, *backend_args, **backend_kwargs) pages = pdf_doc.objects( page_numbers=page_numbers, starts_after=starts_after, ends_before=ends_before From c0cdd2e43e80e2146c92cd142d94fd6d2ea27849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20=28=40turicas=29?= Date: Fri, 12 Apr 2019 01:24:35 -0300 Subject: [PATCH 2/4] Add draft OCR implementation using pytesseract --- rows/__init__.py | 3 ++ rows/plugins/__init__.py | 5 ++ rows/plugins/plugin_ocr.py | 100 +++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 rows/plugins/plugin_ocr.py diff --git a/rows/__init__.py b/rows/__init__.py index 90211297..efc0505a 100644 --- a/rows/__init__.py +++ b/rows/__init__.py @@ -74,5 +74,8 @@ if plugins.pdf: import_from_pdf = plugins.pdf.import_from_pdf +if plugins.ocr: + import_from_image = plugins.ocr.import_from_image + __version__ = "0.4.2dev0" diff --git a/rows/plugins/__init__.py b/rows/plugins/__init__.py index 9bb2933f..838edf97 100644 --- a/rows/plugins/__init__.py +++ b/rows/plugins/__init__.py @@ -64,3 +64,8 @@ 
from . import plugin_pdf as pdf except ImportError: pdf = None + +try: + from . import plugin_ocr as ocr +except ImportError: + ocr = None diff --git a/rows/plugins/plugin_ocr.py b/rows/plugins/plugin_ocr.py new file mode 100644 index 00000000..89501427 --- /dev/null +++ b/rows/plugins/plugin_ocr.py @@ -0,0 +1,100 @@ +# coding: utf-8 + +# Copyright 2014-2019 Álvaro Justen + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import unicode_literals

from cached_property import cached_property
from pytesseract import image_to_boxes
from PIL import Image

from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
from rows.plugins.utils import create_table
from rows.plugins.utils_rect import join_contiguous_rects


class TesseractBackend(PDFBackend):
    """`PDFBackend` that gets its text objects from Tesseract character boxes.

    Draft implementation: `number_of_pages` and `extract_text` are still
    placeholders (see the TODO on each).
    """

    name = "tesseract"

    def __init__(self, filename_or_fobj, language):
        """Store the image source and the language passed to Tesseract."""
        self.filename_or_fobj = filename_or_fobj
        self.language = language
        # Explicit two-argument form: this module keeps Python 2 compatibility
        # (`from __future__ import unicode_literals`), where zero-argument
        # `super()` does not exist. Behaviour on Python 3 is identical.
        super(TesseractBackend, self).__init__(self.filename_or_fobj)

    @cached_property
    def document(self):
        """Return what will be handed to `image_to_boxes`.

        Objects with a `read` attribute are opened with `PIL.Image.open`;
        anything else (e.g. a filename) is returned unchanged.
        """
        if hasattr(self.filename_or_fobj, "read"):
            return Image.open(self.filename_or_fobj)
        return self.filename_or_fobj

    @cached_property
    def number_of_pages(self):
        return 1  # TODO: fix

    def extract_text(self, page_numbers=None):
        return ""  # TODO: image_to_string

    def objects(self, page_numbers=None, starts_after=None, ends_before=None):
        """Yield a single list with the `TextObject`s found in the image.

        `page_numbers`, `starts_after` and `ends_before` are accepted but not
        used by this method.

        Character boxes are merged with `join_contiguous_rects` while they are
        still plain dicts: that helper reads `left`/`top`/`right`/`bottom`/
        `char` keys and returns dicts, so it cannot be fed `TextObject`
        instances (which are built from `x0`/`y0`/`x1`/`y1`/`text`).
        """
        header = "char left bottom right top page".split()
        required = len(header) - 1  # "page" is parsed but never read here
        output = image_to_boxes(self.document, lang=self.language)

        rects = []
        max_width = 0
        for line in output.splitlines():
            values = line.split()
            if len(values) < required:
                # Blank or truncated line: it has no usable box and would
                # otherwise fail with KeyError when the sides are read.
                continue
            rect = {
                key: (value if key == "char" else int(value))
                for key, value in zip(header, values)
            }
            rects.append(rect)
            max_width = max(max_width, rect["right"] - rect["left"])

        # Group contiguous characters; the widest glyph is the tolerance.
        merged = join_contiguous_rects(rects, tolerance=max_width)
        text_objs = [
            TextObject(
                x0=rect["left"],
                y0=rect["bottom"],
                x1=rect["right"],
                y1=rect["top"],
                text=rect["char"],
            )
            for rect in merged
        ]
        # NOTE(review): keeps the (y0, x0) ordering this method always applied
        # to its TextObjects - confirm it is the order the algorithms expect.
        text_objs.sort(key=lambda obj: (obj.y0, obj.x0))
        yield text_objs

    text_objects = objects


def import_from_image(
    filename_or_fobj,
    language="eng",
    algorithm="y-groups",
    x_threshold=1.0,
    y_threshold=1.0,
    *args,
    **kwargs
):
    """Build a table from an image by running `pdf_table_lines` over OCR boxes.

    `language` is forwarded to `TesseractBackend`; `algorithm`, `x_threshold`
    and `y_threshold` are forwarded to `pdf_table_lines`; remaining positional
    and keyword arguments go to `create_table`.
    """
    meta = {"imported_from": "image"}
    table_rows = pdf_table_lines(
        filename_or_fobj,
        None,
        starts_after=None,
        ends_before=None,
        algorithm=algorithm,
        x_threshold=x_threshold,
        y_threshold=y_threshold,
        backend=TesseractBackend,
        backend_kwargs={"language": language},
    )
    return create_table(table_rows, meta=meta, *args, **kwargs)
from __future__ import unicode_literals

import unittest

import rows

import tests.utils as utils


class PluginOcrTestCase(utils.RowsTestMixIn, unittest.TestCase):
    """Draft tests for the OCR (image) plugin."""

    plugin_name = "ocr"
    file_extension = "png"
    filename = "tests/data/all-field-types.png"

    def test_imports(self):
        self.assertIs(rows.import_from_image, rows.plugins.ocr.import_from_image)

    @unittest.skip("draft: the import runs but nothing is asserted yet")
    def test_basic(self):
        # Previously named `basic_test`, which unittest never collects because
        # it lacks the `test` prefix. Now it is collected and reported as
        # skipped, so the missing assertions stay visible in test reports.
        table = rows.import_from_image(self.filename)
        # TODO: assert
from bisect import bisect_left, bisect_right
from copy import copy


# Order matters: `consolidate` zips these names with (min, max, max, min).
SIDES = "left top right bottom".split()


class Rect:
    """Bounding box whose attributes are copied from a mapping.

    Every key of the mapping becomes an attribute, so extra keys such as
    `char` or `page` travel with the four sides. Equality and hashing only
    look at the sides listed in `SIDES`.
    """

    def __init__(self, rect):
        self.__dict__.update(rect)

    def __hash__(self):
        return hash((self.left, self.top, self.right, self.bottom))

    def __getitem__(self, item):
        return self.__dict__[item]

    def __setitem__(self, item, value):
        self.__dict__[item] = value

    def __eq__(self, other):
        try:
            return all(self[side] == other[side] for side in SIDES)
        except (KeyError, IndexError, TypeError):
            # `other` is not rect-like (None, str, ...): let Python decide
            # instead of raising from a plain comparison.
            return NotImplemented

    def __repr__(self):
        return "<{left}, {top}, {right}, {bottom}>".format(**self.__dict__)


def consolidate(new_rect, rect1, rect2):
    """Grow `new_rect` so that it also covers `rect1` and `rect2`.

    When `new_rect` is None a shallow copy of `rect1` is used as the starting
    point, so the non-side attributes of the result come from `rect1`.
    `left`/`bottom` take the minimum and `top`/`right` the maximum.
    """
    if new_rect is None:
        new_rect = copy(rect1)
    for op, side in zip((min, max, max, min), SIDES):
        new_rect[side] = op(r[side] for r in (new_rect, rect1, rect2))

    return new_rect


def mag(x, y):
    """Return the squared length of the vector (x, y)."""
    return x ** 2 + y ** 2


def find_paired_rects(rects, tolerance):
    """Map each rect to the rect that best continues it to the right.

    A candidate continues `rect` when its `left` is within `tolerance` of
    `rect.right` and its `top` is within `tolerance` of `rect.top` (both
    inclusive). Among candidates the smallest squared offset wins; ties keep
    the candidate with the smaller `left`, then the earlier one in `rects`.

    `tolerance` and the coordinates may be ints or floats.
    """
    # Stable sort: rects sharing a `left` stay in input order (tie-breaking).
    by_left = sorted(rects, key=lambda r: r.left)
    lefts = [r.left for r in by_left]

    paired = {}
    for rect in rects:
        best = None
        first = bisect_left(lefts, rect.right - tolerance)
        last = bisect_right(lefts, rect.right + tolerance)
        for candidate in by_left[first:last]:
            if candidate is rect:
                continue
            offset_y = candidate.top - rect.top
            if abs(offset_y) > tolerance:
                continue
            distance = mag(candidate.left - rect.right, offset_y)
            if best is None or distance < best:
                paired[rect] = candidate
                best = distance

    return paired


def join_contiguous_rects(rect_dicts, tolerance=1):
    """Merge horizontally contiguous boxes and concatenate their `char`.

    `rect_dicts` is an iterable of mappings with `left`, `top`, `right`,
    `bottom` and `char` keys; other keys are kept, taken from the first rect
    of each merged run. The input mappings are not modified.

    Returns a list of dicts (merged runs plus untouched rects) ordered by
    descending `top`, then ascending `left`. Each input rect contributes to
    at most one merged run.
    """
    rects = [Rect(rect) for rect in rect_dicts]
    paired = find_paired_rects(rects, tolerance)

    consolidated = []
    to_remove = set()
    for start in sorted(rects, key=lambda r: r.left):
        if start in to_remove:
            continue

        chain = [start]
        visited = {start}
        while chain[-1] in paired:
            following = paired[chain[-1]]
            # `visited` stops mutually paired boxes (A -> B -> A) from looping
            # forever; `to_remove` stops a glyph already merged into another
            # run from being emitted a second time.
            if following in visited or following in to_remove:
                break
            chain.append(following)
            visited.add(following)

        if len(chain) == 1:
            continue  # nothing to merge with: emitted unchanged below

        merged = None
        for current, following in zip(chain, chain[1:]):
            merged = consolidate(merged, current, following)
        merged.char = "".join(rect.char for rect in chain)
        to_remove.update(chain)
        consolidated.append(merged)

    remaining = [rect for rect in rects if rect not in to_remove]
    ordered = sorted(consolidated + remaining, key=lambda r: (-r.top, r.left))
    return [rect.__dict__ for rect in ordered]
class TestRectUtils(unittest.TestCase):
    """Checks for the rectangle helpers in `rows.plugins.utils_rect`."""

    def test_join_contiguous_rects(self):
        # The eleven character boxes in `test_data` must collapse into one
        # box spanning all of them, with the characters concatenated.
        expected = [
            {
                'char': 'RS25.000,00',
                'left': 1282.0,
                'bottom': 51.0,
                'right': 1362.0,
                'top': 64.0,
                'page': 0.0,
            }
        ]
        # `assertEqual`, not the `assertEquals` alias: the alias is deprecated
        # and no longer exists in Python 3.12+.
        self.assertEqual(join_contiguous_rects(test_data, 10), expected)