Skip to content

fix(document): save document images page by page #226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -2911,7 +2911,10 @@ def _with_embedded_pictures(self) -> "DoclingDocument":
return result

def _with_pictures_refs(
self, image_dir: Path, reference_path: Optional[Path] = None
self,
image_dir: Path,
page_no: Optional[int],
reference_path: Optional[Path] = None,
) -> "DoclingDocument":
"""Document with images as refs.

Expand All @@ -2924,7 +2927,7 @@ def _with_pictures_refs(
image_dir.mkdir(parents=True, exist_ok=True)

if image_dir.is_dir():
for item, level in result.iterate_items(with_groups=False):
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
if isinstance(item, PictureItem):

if (
Expand Down Expand Up @@ -2999,7 +3002,7 @@ def save_as_json(
os.makedirs(artifacts_dir, exist_ok=True)

new_doc = self._make_copy_with_refmode(
artifacts_dir, image_mode, reference_path=reference_path
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
)

out = new_doc.export_to_dict()
Expand Down Expand Up @@ -3038,7 +3041,7 @@ def save_as_yaml(
os.makedirs(artifacts_dir, exist_ok=True)

new_doc = self._make_copy_with_refmode(
artifacts_dir, image_mode, reference_path=reference_path
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
)

out = new_doc.export_to_dict()
Expand Down Expand Up @@ -3099,7 +3102,7 @@ def save_as_markdown(
os.makedirs(artifacts_dir, exist_ok=True)

new_doc = self._make_copy_with_refmode(
artifacts_dir, image_mode, reference_path=reference_path
artifacts_dir, image_mode, page_no, reference_path=reference_path
)

md_out = new_doc.export_to_markdown(
Expand Down Expand Up @@ -3261,7 +3264,7 @@ def save_as_html(
os.makedirs(artifacts_dir, exist_ok=True)

new_doc = self._make_copy_with_refmode(
artifacts_dir, image_mode, reference_path=reference_path
artifacts_dir, image_mode, page_no, reference_path=reference_path
)

html_out = new_doc.export_to_html(
Expand Down Expand Up @@ -3298,14 +3301,15 @@ def _make_copy_with_refmode(
self,
artifacts_dir: Path,
image_mode: ImageRefMode,
page_no: Optional[int],
reference_path: Optional[Path] = None,
):
new_doc = None
if image_mode == ImageRefMode.PLACEHOLDER:
new_doc = self
elif image_mode == ImageRefMode.REFERENCED:
new_doc = self._with_pictures_refs(
image_dir=artifacts_dir, reference_path=reference_path
image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
)
elif image_mode == ImageRefMode.EMBEDDED:
new_doc = self._with_embedded_pictures()
Expand Down
44 changes: 42 additions & 2 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,12 +1113,51 @@ def test_save_pictures():

doc: DoclingDocument = _construct_doc()

new_doc = doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"))
new_doc = doc._with_pictures_refs(
image_dir=Path("./test/data/constructed_images/"), page_no=None
)

img_paths = new_doc._list_images_on_disk()
assert len(img_paths) == 1, "len(img_paths)!=1"


def test_save_pictures_with_page():
    """Check that ``_with_pictures_refs`` respects the ``page_no`` filter.

    A two-page document is built whose only picture carries provenance on
    page 2. Exporting references for page 1 must leave no image listed on
    disk; exporting for page 2 afterwards must list exactly one.
    """
    # Given: a document with two pages and one picture located on page 2.
    document = DoclingDocument(name="Dummy")
    document.add_page(
        page_no=1,
        size=Size(width=2000, height=4000),
        image=None,
    )
    document.add_page(page_no=2, size=Size(width=2000, height=4000))

    black_image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0))
    picture_ref = ImageRef.from_pil(image=black_image, dpi=72)
    picture_bbox = BoundingBox(
        b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT
    )
    picture_prov = ProvenanceItem(
        page_no=2,
        bbox=picture_bbox,
        charspan=(1, 2),
    )
    document.add_picture(image=picture_ref, prov=picture_prov)

    # When: only page 1 is exported (the picture is not on that page).
    page_one_doc = document._with_pictures_refs(
        image_dir=Path("./test/data/constructed_images/"), page_no=1
    )

    # Then: no picture of the resulting document is listed on disk.
    assert len(page_one_doc._list_images_on_disk()) == 0

    # When: page 2 is exported, starting from the previous result.
    page_two_doc = page_one_doc._with_pictures_refs(
        image_dir=Path("./test/data/constructed_images/"), page_no=2
    )

    # Then: exactly one picture is listed on disk.
    assert len(page_two_doc._list_images_on_disk()) == 1


def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):

for p in paths:
Expand Down Expand Up @@ -1170,7 +1209,8 @@ def test_save_to_disk():
image_dir = Path("./test/data/doc/constructed_images/")

doc_with_references = doc._with_pictures_refs(
image_dir=image_dir # Path("./test/data/constructed_images/")
image_dir=image_dir, # Path("./test/data/constructed_images/")
page_no=None,
)

# paths will be different on different machines, so needs to be kept!
Expand Down