From 9022b61dd5e87ea0a93275a3a6caba2634a42d27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cle=CC=81ment=20Doumouro?= Date: Tue, 1 Apr 2025 15:26:23 +0200 Subject: [PATCH] fix(document): save document images page by page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Clément Doumouro --- docling_core/types/doc/document.py | 18 +++++++----- test/test_docling_doc.py | 44 ++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 06af6a13..1a340555 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2911,7 +2911,10 @@ def _with_embedded_pictures(self) -> "DoclingDocument": return result def _with_pictures_refs( - self, image_dir: Path, reference_path: Optional[Path] = None + self, + image_dir: Path, + page_no: Optional[int], + reference_path: Optional[Path] = None, ) -> "DoclingDocument": """Document with images as refs. @@ -2924,7 +2927,7 @@ def _with_pictures_refs( image_dir.mkdir(parents=True, exist_ok=True) if image_dir.is_dir(): - for item, level in result.iterate_items(with_groups=False): + for item, level in result.iterate_items(page_no=page_no, with_groups=False): if isinstance(item, PictureItem): if ( @@ -2999,7 +3002,7 @@ def save_as_json( os.makedirs(artifacts_dir, exist_ok=True) new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, reference_path=reference_path + artifacts_dir, image_mode, page_no=None, reference_path=reference_path ) out = new_doc.export_to_dict() @@ -3038,7 +3041,7 @@ def save_as_yaml( os.makedirs(artifacts_dir, exist_ok=True) new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, reference_path=reference_path + artifacts_dir, image_mode, page_no=None, reference_path=reference_path ) out = new_doc.export_to_dict() @@ -3099,7 +3102,7 @@ def save_as_markdown( os.makedirs(artifacts_dir, exist_ok=True) new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, reference_path=reference_path + artifacts_dir, image_mode, page_no, reference_path=reference_path ) md_out = new_doc.export_to_markdown( @@ -3261,7 +3264,7 @@ def save_as_html( os.makedirs(artifacts_dir, exist_ok=True) new_doc = self._make_copy_with_refmode( - artifacts_dir, image_mode, reference_path=reference_path + artifacts_dir, image_mode, page_no, reference_path=reference_path ) html_out = new_doc.export_to_html( @@ -3298,6 +3301,7 @@ def _make_copy_with_refmode( self, artifacts_dir: Path, image_mode: ImageRefMode, + page_no: Optional[int], reference_path: Optional[Path] = None, ): new_doc = None @@ -3305,7 +3309,7 @@ def _make_copy_with_refmode( new_doc = self elif image_mode == ImageRefMode.REFERENCED: new_doc = self._with_pictures_refs( - image_dir=artifacts_dir, reference_path=reference_path + image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path ) elif image_mode == ImageRefMode.EMBEDDED: new_doc = self._with_embedded_pictures() diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index f29f19ed..47093895 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -1113,12 +1113,51 @@ def test_save_pictures(): doc: DoclingDocument = _construct_doc() - new_doc = doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/")) + new_doc = doc._with_pictures_refs( + image_dir=Path("./test/data/constructed_images/"), page_no=None + ) img_paths = new_doc._list_images_on_disk() assert len(img_paths) == 1, "len(img_paths)!=1" +def test_save_pictures_with_page(): + # Given + doc = DoclingDocument(name="Dummy") + + doc.add_page(page_no=1, size=Size(width=2000, height=4000), image=None) + doc.add_page( + page_no=2, + size=Size(width=2000, height=4000), + ) + image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0)) + doc.add_picture( + image=ImageRef.from_pil(image=image, dpi=72), + prov=ProvenanceItem( + page_no=2, + bbox=BoundingBox( + b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT + ), + charspan=(1, 2), + ), + ) + + # When + with_ref = doc._with_pictures_refs( + image_dir=Path("./test/data/constructed_images/"), page_no=1 + ) + # Then + n_images = len(with_ref._list_images_on_disk()) + assert n_images == 0 + # When + with_ref = with_ref._with_pictures_refs( + image_dir=Path("./test/data/constructed_images/"), page_no=2 + ) + n_images = len(with_ref._list_images_on_disk()) + # Then + assert n_images == 1 + + def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]): for p in paths: @@ -1170,7 +1209,8 @@ def test_save_to_disk(): image_dir = Path("./test/data/doc/constructed_images/") doc_with_references = doc._with_pictures_refs( - image_dir=image_dir # Path("./test/data/constructed_images/") + image_dir=image_dir, # Path("./test/data/constructed_images/") + page_no=None, ) # paths will be different on different machines, so needs to be kept!