Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds code to merge contiguous rectangular areas #324

Open
wants to merge 4 commits into
base: feature/plugin-ocr
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions rows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,5 +74,8 @@
# Re-export the PDF importer only when the plugin module could be imported
# (`plugins.pdf` is set to None when its import raised ImportError).
if plugins.pdf:
    import_from_pdf = plugins.pdf.import_from_pdf

# Same guard for the OCR importer: `plugins.ocr` is None when its import
# raised ImportError.
if plugins.ocr:
    import_from_image = plugins.ocr.import_from_image


__version__ = "0.4.2dev0"
5 changes: 5 additions & 0 deletions rows/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,8 @@
from . import plugin_pdf as pdf
except ImportError:
pdf = None

try:
from . import plugin_ocr as ocr
except ImportError:
ocr = None
104 changes: 104 additions & 0 deletions rows/plugins/plugin_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# coding: utf-8

# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import unicode_literals

from cached_property import cached_property
from pytesseract import image_to_boxes
from PIL import Image

from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
from rows.plugins.utils import create_table
from rows.plugins.utils_rect import join_contiguous_rects


class TesseractBackend(PDFBackend):
    """Backend that feeds Tesseract character boxes into the PDF table code.

    The image is treated as a single-page document: `objects` runs
    `pytesseract.image_to_boxes` on it, merges neighbouring character boxes
    into longer text runs and yields them as `TextObject` instances.
    """

    name = "tesseract"

    def __init__(self, filename_or_fobj, language):
        """Remember the image source and the Tesseract language code.

        Args:
            filename_or_fobj: image path, or an object with a ``read`` method.
            language: value passed to Tesseract as ``lang``.
        """
        self.filename_or_fobj = filename_or_fobj
        self.language = language
        # Two-argument form: this module carries
        # `from __future__ import unicode_literals`, and the bare `super()`
        # call is Python 3 only.
        super(TesseractBackend, self).__init__(self.filename_or_fobj)

    @cached_property
    def document(self):
        """Return the object handed to Tesseract.

        File-like sources are opened with PIL; anything else (presumably a
        path) is passed through untouched.
        """
        source = self.filename_or_fobj
        if hasattr(source, "read"):
            return Image.open(source)
        return source

    @cached_property
    def number_of_pages(self):
        """Return the page count (currently hard-coded to 1)."""
        return 1  # TODO: fix

    def extract_text(self, page_numbers=None):
        """Return the document text (not implemented yet: always ``""``)."""
        return ""  # TODO: image_to_string

    def objects(self, page_numbers=None, starts_after=None, ends_before=None):
        """Yield one list of `TextObject`s for the whole image.

        `page_numbers`, `starts_after` and `ends_before` are accepted for
        signature compatibility and are currently ignored.

        Each non-empty line of the `image_to_boxes` output is read as
        ``char left bottom right top page``. The boxes are merged with
        `join_contiguous_rects`, using the widest box as tolerance, and only
        then converted to `TextObject`s: the merge helper works on mappings
        with ``left``/``top``/``right``/``bottom``/``char`` keys and returns
        plain dicts, so it cannot be fed `TextObject` instances.
        """
        header = "char left bottom right top page".split()
        output = image_to_boxes(self.document, lang=self.language)

        boxes = []
        max_width = 0
        for line in output.splitlines():
            fields = line.split()
            if not fields:
                # A blank line carries no box and would otherwise fail below.
                continue
            box = {
                key: value if key == "char" else int(value)
                for key, value in zip(header, fields)
            }
            boxes.append(box)
            max_width = max(max_width, box["right"] - box["left"])

        boxes.sort(key=lambda box: (box["bottom"], box["left"]))
        # Group contiguous boxes before yielding.
        merged = join_contiguous_rects(boxes, tolerance=max_width)
        yield [
            TextObject(
                x0=rect["left"],
                y0=rect["bottom"],
                x1=rect["right"],
                y1=rect["top"],
                text=rect["char"],
            )
            for rect in merged
        ]

    text_objects = objects


def import_from_image(
    filename_or_fobj,
    language="eng",
    algorithm="y-groups",
    x_threshold=1.0,
    y_threshold=1.0,
    *args,
    **kwargs
):
    """Import a table from an image using the Tesseract backend.

    Args:
        filename_or_fobj: image source handed to `TesseractBackend`.
        language: Tesseract language code given to the backend.
        algorithm, x_threshold, y_threshold: forwarded to `pdf_table_lines`.
        *args, **kwargs: forwarded to `create_table`.

    Returns:
        Whatever `create_table` returns for the extracted rows, with
        ``{"imported_from": "image"}`` as metadata.
    """
    extraction_options = {
        "starts_after": None,
        "ends_before": None,
        "algorithm": algorithm,
        "x_threshold": x_threshold,
        "y_threshold": y_threshold,
        "backend": TesseractBackend,
        "backend_kwargs": {"language": language},
    }
    # The second positional argument stays None, as no page selection is made.
    table_rows = pdf_table_lines(filename_or_fobj, None, **extraction_options)
    return create_table(
        table_rows, *args, meta={"imported_from": "image"}, **kwargs
    )
6 changes: 5 additions & 1 deletion rows/plugins/plugin_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,13 +714,17 @@ def pdf_table_lines(
x_threshold=0.5,
y_threshold=0.5,
backend=None,
backend_args=None,
backend_kwargs=None,
):
backend = backend or default_backend()

# TODO: check if both backends accepts filename or fobj
Backend = get_backend(backend)
Algorithm = get_algorithm(algorithm)
pdf_doc = Backend(filename_or_fobj)
backend_args = backend_args or []
backend_kwargs = backend_kwargs or {}
pdf_doc = Backend(filename_or_fobj, *backend_args, **backend_kwargs)

pages = pdf_doc.objects(
page_numbers=page_numbers, starts_after=starts_after, ends_before=ends_before
Expand Down
98 changes: 98 additions & 0 deletions rows/plugins/utils_rect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from copy import copy


SIDES = "left top right bottom".split()


class Rect:
    """Rectangle whose fields are reachable as attributes and as items.

    Every key of the mapping given to the constructor becomes an instance
    attribute, so extra fields (for example ``char``) travel along with the
    four sides. Equality and hashing look only at ``left``, ``top``,
    ``right`` and ``bottom``.
    """

    def __init__(self, rect):
        """Copy every key/value pair of `rect` onto the instance."""
        self.__dict__.update(rect)

    def __hash__(self):
        return hash((self.left, self.top, self.right, self.bottom))

    def __getitem__(self, item):
        return self.__dict__[item]

    def __setitem__(self, item, value):
        self.__dict__[item] = value

    def __eq__(self, other):
        """Compare the four sides with another rect-like object.

        `other` may be a `Rect` or any mapping with the four side keys.
        Objects that cannot provide them are reported as "not comparable"
        (`NotImplemented`) instead of raising, so ``rect == None`` is False.
        """
        try:
            other_sides = (
                other["left"],
                other["top"],
                other["right"],
                other["bottom"],
            )
        except (TypeError, KeyError, IndexError):
            return NotImplemented
        own_sides = (self["left"], self["top"], self["right"], self["bottom"])
        return own_sides == other_sides

    def __repr__(self):
        return "<{left}, {top}, {right}, {bottom}>".format(**self.__dict__)


def consolidate(new_rect, rect1, rect2):
    """Grow `new_rect` so that it also covers `rect1` and `rect2`.

    When `new_rect` is None a shallow copy of `rect1` is used as the starting
    point, so `rect1` itself is never modified. The result keeps the smallest
    ``left`` and ``bottom`` and the largest ``top`` and ``right`` seen among
    the three rectangles.

    Returns:
        The updated accumulator (the object passed in as `new_rect`, or the
        fresh copy of `rect1`).
    """
    merged = copy(rect1) if new_rect is None else new_rect
    reducers = (("left", min), ("top", max), ("right", max), ("bottom", min))
    for side, pick in reducers:
        merged[side] = pick(merged[side], rect1[side], rect2[side])
    return merged


def mag(x, y):
    """Return the squared length of the vector ``(x, y)``.

    No square root is taken: the value is only used to rank offsets.
    """
    x_squared = x ** 2
    y_squared = y ** 2
    return x_squared + y_squared


def find_paired_rects(rects, tolerance):
    """Map each rect to the rect that best continues it on the right.

    A candidate continues `rect` when its ``left`` equals ``rect.right`` plus
    a whole-number offset within ``[-tolerance, tolerance]`` and its ``top``
    differs from ``rect.top`` by a whole number within the same range. Among
    the candidates, the one with the smallest squared offset (`mag`) wins;
    on a tie the first one found is kept.

    Args:
        rects: hashable objects exposing ``left``, ``right`` and ``top``.
        tolerance: maximum offset on each axis; must be an integer because it
            is used to build a `range`.

    Returns:
        A dict ``{rect: neighbour}``; rects with no neighbour are absent.
    """
    rects_by_left = {}
    for candidate in rects:
        rects_by_left.setdefault(candidate.left, []).append(candidate)

    paired = {}
    for rect in rects:
        best_mag = None
        for offset_x in range(-tolerance, tolerance + 1):
            for aligned_rect in rects_by_left.get(rect.right + offset_x, ()):
                if aligned_rect is rect:
                    continue
                # Compute the vertical offset directly instead of scanning
                # every value in the tolerance window for a match. Only
                # whole-number offsets count, mirroring the x axis.
                offset_y = aligned_rect.top - rect.top
                if abs(offset_y) > tolerance or offset_y % 1:
                    continue
                new_mag = mag(offset_x, offset_y)
                if best_mag is None or new_mag < best_mag:
                    paired[rect] = aligned_rect
                    best_mag = new_mag

    return paired


def join_contiguous_rects(rect_dicts, tolerance=1):
    """Merge horizontally contiguous rectangles into larger ones.

    Args:
        rect_dicts: mappings with ``left``, ``top``, ``right``, ``bottom``
            and ``char`` keys; any other keys are carried along. The input
            mappings are not modified.
        tolerance: integer slack, on both axes, used by `find_paired_rects`
            when deciding whether two rectangles touch.

    Returns:
        A list of dicts sorted by descending ``top`` and then ascending
        ``left``. Each merged entry spans the bounding box of its chain, has
        the concatenated ``char`` values, and takes its remaining keys from
        the first rectangle of the chain. Unmerged rectangles are returned
        as they came in.
    """
    rects = [Rect(rect) for rect in rect_dicts]
    paired = find_paired_rects(rects, tolerance)

    consolidated = []
    to_remove = set()

    for rect in sorted(rects, key=lambda r: r.left):
        if rect in to_remove:
            continue
        new_rect = None
        chars = []
        # With a tolerance wider than a glyph, a rect can be paired back to
        # one already walked in this chain; without this guard the walk
        # below would never terminate.
        visited = {rect}
        while rect in paired and paired[rect] not in visited:
            chars.append(rect.char)
            new_rect = consolidate(new_rect, rect, paired[rect])
            to_remove.add(rect)
            rect = paired[rect]
            visited.add(rect)

        chars.append(rect.char)
        if new_rect is not None:
            new_rect.char = "".join(chars)
            to_remove.add(rect)
            consolidated.append(new_rect)

    remaining = [rect for rect in rects if rect not in to_remove]
    ordered = sorted(consolidated + remaining, key=lambda r: (-r.top, r.left))
    return [rect.__dict__ for rect in ordered]

62 changes: 62 additions & 0 deletions tests/tests_plugin_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# coding: utf-8

# Copyright 2014-2019 Álvaro Justen <https://github.com/turicas/rows/>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import unittest

import rows
from rows.plugins.utils_rect import join_contiguous_rects

import tests.utils as utils

# Character boxes in the left/bottom/right/top/page layout consumed by
# `join_contiguous_rects`; the rect test in this module expects them to merge
# into the single string 'RS25.000,00'.
test_data = [
    {'char': 'R', 'left': 1282.0, 'bottom': 52.0, 'right': 1284.0, 'top': 63.0, 'page': 0.0},
    {'char': 'S', 'left': 1284.0, 'bottom': 52.0, 'right': 1295.0, 'top': 63.0, 'page': 0.0},
    {'char': '2', 'left': 1302.0, 'bottom': 52.0, 'right': 1303.0, 'top': 63.0, 'page': 0.0},
    {'char': '5', 'left': 1303.0, 'bottom': 52.0, 'right': 1309.0, 'top': 63.0, 'page': 0.0},
    {'char': '.', 'left': 1312.0, 'bottom': 53.0, 'right': 1317.0, 'top': 63.0, 'page': 0.0},
    {'char': '0', 'left': 1319.0, 'bottom': 53.0, 'right': 1321.0, 'top': 56.0, 'page': 0.0},
    {'char': '0', 'left': 1326.0, 'bottom': 53.0, 'right': 1334.0, 'top': 64.0, 'page': 0.0},
    {'char': '0', 'left': 1334.0, 'bottom': 53.0, 'right': 1338.0, 'top': 64.0, 'page': 0.0},
    {'char': ',', 'left': 1338.0, 'bottom': 53.0, 'right': 1343.0, 'top': 64.0, 'page': 0.0},
    {'char': '0', 'left': 1344.0, 'bottom': 51.0, 'right': 1347.0, 'top': 56.0, 'page': 0.0},
    {'char': '0', 'left': 1352.0, 'bottom': 53.0, 'right': 1362.0, 'top': 64.0, 'page': 0.0},
]


class PluginOcrTestCase(utils.RowsTestMixIn, unittest.TestCase):
    """Tests for the OCR (image) import plugin."""

    plugin_name = "ocr"
    file_extension = "png"
    filename = "tests/data/all-field-types.png"

    def test_imports(self):
        """`rows.import_from_image` must be the plugin's own function."""
        self.assertIs(rows.import_from_image, rows.plugins.ocr.import_from_image)

    # NOTE(review): the name lacks the `test_` prefix, so unittest's default
    # discovery does not run this method — confirm whether that is deliberate
    # while the assertions are still missing.
    def basic_test(self):
        table = rows.import_from_image(self.filename)
        # TODO: assert


class TestRectUtils(unittest.TestCase):
    """Tests for the rectangle helpers in `rows.plugins.utils_rect`."""

    def test_join_contiguous_rects(self):
        """All sample character boxes merge into one bounding rectangle."""
        expected = [
            {'char': 'RS25.000,00', 'left': 1282.0, 'bottom': 51.0, 'right': 1362.0, 'top': 64.0, 'page': 0.0}
        ]
        # `assertEquals` is a deprecated alias that was removed in
        # Python 3.12; `assertEqual` is the supported spelling.
        self.assertEqual(join_contiguous_rects(test_data, 10), expected)