From 0053402978c08a25c95e0a20ab0a4cfa89518e3f Mon Sep 17 00:00:00 2001
From: rlskoeser <rebecca.s.koeser@princeton.edu>
Date: Tue, 10 Dec 2024 12:33:55 -0500
Subject: [PATCH 1/2] Correct return value for image load error in extract line
 & line path

---
 kraken/lib/arrow_dataset.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kraken/lib/arrow_dataset.py b/kraken/lib/arrow_dataset.py
index 243b77eb..864a3b52 100644
--- a/kraken/lib/arrow_dataset.py
+++ b/kraken/lib/arrow_dataset.py
@@ -50,8 +50,9 @@ def _extract_line(xml_record, skip_empty_lines: bool = True, legacy_polygons: bo
         im = Image.open(xml_record.imagename)
         if is_bitonal(im):
             im = im.convert('1')
-    except (OSError, FileNotFoundError, UnidentifiedImageError):
-        return lines, None, None
+    except (OSError, FileNotFoundError, UnidentifiedImageError) as err:
+        logger.warning(f'Error loading image {xml_record.imagename}: {err}')
+        return lines, None
     for idx, rec in enumerate(xml_record.lines):
         seg = Segmentation(text_direction='horizontal-lr',
                            imagename=xml_record.imagename,
@@ -79,10 +80,11 @@ def _extract_line(xml_record, skip_empty_lines: bool = True, legacy_polygons: bo
 def _extract_path_line(xml_record, skip_empty_lines: bool = True):
     try:
         im = Image.open(xml_record['image'])
-    except (FileNotFoundError, UnidentifiedImageError):
-        return [], None, None
+    except (FileNotFoundError, UnidentifiedImageError) as err:
+        logger.warning(f'Error loading image {xml_record["image"]}: {err}')  # record is subscripted here
+        return [], None
     if not xml_record['lines'][0]['text'] and skip_empty_lines:
-        return [], None, None
+        return [], None
     if is_bitonal(im):
         im = im.convert('1')
     fp = io.BytesIO()

From d3565877577484424685569b1ac64ff7afb00819 Mon Sep 17 00:00:00 2001
From: rlskoeser <rebecca.s.koeser@princeton.edu>
Date: Tue, 10 Dec 2024 12:57:24 -0500
Subject: [PATCH 2/2] Add a test for image error handling

---
 tests/test_arrow_dataset.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 31c3fb8b..ee2b7530 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -6,7 +6,7 @@
 import pyarrow as pa
 
 from pathlib import Path
-from pytest import raises
+from pytest import raises, fixture
 
 import kraken
 from kraken.lib import xml
@@ -89,3 +89,26 @@ def test_build_empty_dataset(self):
                                  format_type='xml',
                                  skip_empty_lines=False)
             _validate_ds(self, tmp_file.name, 5, 1, 'kraken_recognition_baseline')
+
+    @fixture(autouse=True)
+    def caplog_fixture(self, caplog):
+        # store pytest's caplog fixture on the instance so test methods can read self.caplog
+        self.caplog = caplog
+
+    def test_build_image_error(self):
+        """
+        Test that image load errors are handled.
+        """
+        # rename each input to "bogus_<stem>" so the path will not resolve
+        bad_box_lines = [path.with_name(f"bogus_{path.stem}") for path in self.box_lines]
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=bad_box_lines,
+                                 output_file=tmp_file.name,
+                                 format_type='xml')
+            # expect zero resulting lines (NOTE(review): asserted warning is 'Invalid input file' - confirm cause)
+            _validate_ds(self, tmp_file.name, 0, 0, 'kraken_recognition_baseline')
+        # expect one warning log message; it should name the first bad input file
+        assert len(self.caplog.records) == 1
+        log_record = self.caplog.records[0]
+        assert log_record.levelname == "WARNING"
+        assert f"Invalid input file {bad_box_lines[0]}" in log_record.message