From 0053402978c08a25c95e0a20ab0a4cfa89518e3f Mon Sep 17 00:00:00 2001
From: rlskoeser
Date: Tue, 10 Dec 2024 12:33:55 -0500
Subject: [PATCH 1/2] Correct return value for image load error in extract line & line path

---
Review note (ignored by git-am): _extract_path_line indexes its record like a
mapping (xml_record['image'], xml_record['lines']), so the warning message
reads the image path the same way. Using xml_record.imagename there would
raise AttributeError inside the except handler instead of logging and
returning the ([], None) pair.

 kraken/lib/arrow_dataset.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kraken/lib/arrow_dataset.py b/kraken/lib/arrow_dataset.py
index 243b77eb..864a3b52 100644
--- a/kraken/lib/arrow_dataset.py
+++ b/kraken/lib/arrow_dataset.py
@@ -50,8 +50,9 @@ def _extract_line(xml_record, skip_empty_lines: bool = True, legacy_polygons: bo
         im = Image.open(xml_record.imagename)
         if is_bitonal(im):
             im = im.convert('1')
-    except (OSError, FileNotFoundError, UnidentifiedImageError):
-        return lines, None, None
+    except (OSError, FileNotFoundError, UnidentifiedImageError) as err:
+        logger.warning(f'Error loading image {xml_record.imagename}: {err}')
+        return lines, None
     for idx, rec in enumerate(xml_record.lines):
         seg = Segmentation(text_direction='horizontal-lr',
                            imagename=xml_record.imagename,
@@ -79,10 +80,11 @@ def _extract_line(xml_record, skip_empty_lines: bool = True, legacy_polygons: bo
 def _extract_path_line(xml_record, skip_empty_lines: bool = True):
     try:
         im = Image.open(xml_record['image'])
-    except (FileNotFoundError, UnidentifiedImageError):
-        return [], None, None
+    except (FileNotFoundError, UnidentifiedImageError) as err:
+        logger.warning(f"Error loading image {xml_record['image']}: {err}")
+        return [], None
     if not xml_record['lines'][0]['text'] and skip_empty_lines:
-        return [], None, None
+        return [], None
     if is_bitonal(im):
         im = im.convert('1')
     fp = io.BytesIO()

From d3565877577484424685569b1ac64ff7afb00819 Mon Sep 17 00:00:00 2001
From: rlskoeser
Date: Tue, 10 Dec 2024 12:57:24 -0500
Subject: [PATCH 2/2] Add a test for image error handling

---
 tests/test_arrow_dataset.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 31c3fb8b..ee2b7530 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -6,7 +6,7 @@
 import pyarrow as pa
 
 from pathlib import Path
-from pytest import raises
+from pytest import raises, fixture
 
 import kraken
 from kraken.lib import xml
@@ -89,3 +89,26 @@ def test_build_empty_dataset(self):
                                  format_type='xml',
                                  skip_empty_lines=False)
             _validate_ds(self, tmp_file.name, 5, 1, 'kraken_recognition_baseline')
+
+    @fixture(autouse=True)
+    def caplog_fixture(self, caplog):
+        # make pytest caplog fixture available
+        self.caplog = caplog
+
+    def test_build_image_error(self):
+        """
+        Test that image load errors are handled.
+        """
+        # change resource path so it will not resolve
+        bad_box_lines = [path.with_name(f"bogus_{path.stem}") for path in self.box_lines]
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=bad_box_lines,
+                                 output_file=tmp_file.name,
+                                 format_type='xml')
+            # expect zero resulting lines due to image load error
+            _validate_ds(self, tmp_file.name, 0, 0, 'kraken_recognition_baseline')
+            # expect one warning log message; should include the file image filename
+            assert len(self.caplog.records) == 1
+            log_record = self.caplog.records[0]
+            assert log_record.levelname == "WARNING"
+            assert f"Invalid input file {bad_box_lines[0]}" in log_record.message