Skip to content

Commit

Permalink
docling-core updates
Browse files (browse the repository at this point in the history)
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 30, 2024
1 parent 8e7a28d commit 13350bf
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 36 deletions.
40 changes: 22 additions & 18 deletions deepsearch_glm/utils/doc_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import re
from pathlib import Path
from typing import List

import pandas as pd
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
from docling_core.types.experimental.document import DoclingDocument, FileInfo, BaseFigureData, BaseTableData, \
TableCell, ProvenanceItem, PageItem

from docling_core.types.experimental.labels import DocItemLabel
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size, DoclingDocument, BasePictureData, BaseTableData, TableCell, ProvenanceItem, PageItem, DescriptionItem, DocItemLabel, DocumentOrigin


def resolve_item(paths, obj):
Expand Down Expand Up @@ -58,8 +55,16 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
return unique_objects

def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
doc: DoclingDocument = DoclingDocument(description={},
file_info=FileInfo(filename=doc_glm["file-info"]["filename"], document_hash=doc_glm["file-info"]["document-hash"]))
origin = DocumentOrigin(
mimetype = "application/pdf",
filename = doc_glm["file-info"]["filename"],
binary_hash=doc_glm["file-info"]["document-hash"]
)
doc_name = Path(origin.filename).stem

doc: DoclingDocument = DoclingDocument(name=doc_name,
description=DescriptionItem(),
origin=origin)

if "properties" in doc_glm:
props = pd.DataFrame(
Expand Down Expand Up @@ -117,7 +122,7 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:

prov = ProvenanceItem(page_no=nelem["page"], charspan=tuple(nelem["span"]), bbox=BoundingBox.from_tuple(nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

caption_obj = doc.add_paragraph(label=DocItemLabel.CAPTION, text=text, prov=prov)
caption_obj = doc.add_text(label=DocItemLabel.CAPTION, text=text, prov=prov)
caption_refs.append(caption_obj.get_ref())

figure = {
Expand All @@ -139,8 +144,8 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
prov = ProvenanceItem(page_no=pelem["page"], charspan=(0, len(text)),
bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

fig = doc.add_figure(data=BaseFigureData(), prov=prov)
fig.captions.extend(caption_refs)
pic = doc.add_picture(data=BasePictureData(), prov=prov)
pic.captions.extend(caption_refs)

elif ptype == "table":
text = ""
Expand All @@ -166,7 +171,7 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
prov = ProvenanceItem(page_no=pelem["page"], charspan=nelem["span"],
bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

caption_obj = doc.add_paragraph(label=DocItemLabel.CAPTION, text=text, prov=prov)
caption_obj = doc.add_text(label=DocItemLabel.CAPTION, text=text, prov=prov)
caption_refs.append(caption_obj.get_ref())

table_cells_glm = _flatten_table_grid(obj["data"])
Expand Down Expand Up @@ -208,23 +213,22 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
prov = ProvenanceItem(page_no=pelem["page"], charspan=(0, len(text)),
bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

doc.add_paragraph(label=DocItemLabel(name_label), text=text, prov=prov)
doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)

else:
pass
# This branch should not be reachable.

page_to_hash = {
item["page"]: item["hash"]
for item in doc_glm["file-info"]["page-hashes"]
}
#page_to_hash = {
# item["page"]: item["hash"]
# for item in doc_glm["file-info"]["page-hashes"]
#}

for page_dim in doc_glm["page-dimensions"]:
page_no = int(page_dim["page"])
size = Size(width=page_dim["width"], height=page_dim["height"])
hash = page_to_hash[page_no]

doc.add_page(page_no=page_no, size=size, hash=hash)
doc.add_page(page_no=page_no, size=size)

return doc

Expand Down
38 changes: 21 additions & 17 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ build = "build.py"

[tool.poetry.dependencies]
python = "^3.9"
docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", rev = "ce0b7ee64750944e530d03a1cf22a75636fa2775"}
docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", rev = "089d692b95dd704a503e9d15c5072e2ff58a65e6"}
deepsearch-toolkit = { version = ">=0.31.0", optional = true }
tabulate = ">=0.8.9"
numpy = [
Expand Down

0 comments on commit 13350bf

Please sign in to comment.