diff --git a/deepsearch_glm/utils/doc_utils.py b/deepsearch_glm/utils/doc_utils.py index 1ecbda1f..d228eec7 100644 --- a/deepsearch_glm/utils/doc_utils.py +++ b/deepsearch_glm/utils/doc_utils.py @@ -59,7 +59,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]: def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: doc: DoclingDocument = DoclingDocument(description={}, - file_info=FileInfo(document_hash=doc_glm["file-info"]["document-hash"])) + file_info=FileInfo(filename=doc_glm["file-info"]["filename"], document_hash=doc_glm["file-info"]["document-hash"])) if "properties" in doc_glm: props = pd.DataFrame( @@ -299,6 +299,27 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False): """Convert Document object (with `body`) to its legacy format (with `main-text`)""" + reverse_label_mapping = { + DocItemLabel.CAPTION.value: "Caption", + DocItemLabel.FOOTNOTE.value: "Footnote", + DocItemLabel.FORMULA.value: "Formula", + DocItemLabel.LIST_ITEM.value: "List-item", + DocItemLabel.PAGE_FOOTER.value: "Page-footer", + DocItemLabel.PAGE_HEADER.value: "Page-header", + DocItemLabel.PICTURE.value: "Picture", # NOTE(review): earlier remark here read "low threshold, adjusted to capture chemical structures, for example"; no threshold is set in this name mapping, so confirm whether it still applies. 
+ DocItemLabel.SECTION_HEADER.value: "Section-header", + DocItemLabel.TABLE.value: "Table", + DocItemLabel.TEXT.value: "Text", + DocItemLabel.TITLE.value: "Title", + DocItemLabel.DOCUMENT_INDEX.value: "Document Index", + DocItemLabel.CODE.value: "Code", + DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected", + DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected", + DocItemLabel.FORM.value: "Form", + DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region", + DocItemLabel.PARAGRAPH.value: "paragraph" + } + doc_leg["main-text"] = [] doc_leg["figures"] = [] doc_leg["tables"] = [] @@ -360,7 +381,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False): pitem = { "text": text, - "name": nelem["name"], + "name": reverse_label_mapping[nelem["name"]], "type": nelem["type"], "prov": [ { @@ -393,7 +414,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False): pitem = { "$ref": f"#/figures/{find}", - "name": pelem["name"], + "name": reverse_label_mapping[pelem["name"]], "type": pelem["type"], } doc_leg["main-text"].append(pitem) @@ -418,7 +439,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False): pitem = { "text": text, - "name": nelem["name"], + "name": reverse_label_mapping[nelem["name"]], "type": nelem["type"], "prov": [ { @@ -449,7 +470,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False): pitem = { "$ref": f"#/tables/{tind}", - "name": pelem["name"], + "name": reverse_label_mapping[pelem["name"]], "type": pelem["type"], } doc_leg["main-text"].append(pitem) @@ -458,7 +479,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False): text = obj["text"][span_i:span_j] type_label = pelem["type"] - name_label = pelem["name"] + name_label = reverse_label_mapping[pelem["name"]] if update_name_label and len(props) > 0 and type_label == "paragraph": prop = props[ (props["type"] == "semantic") & (props["subj_path"] == iref) @@ -482,7 +503,7 @@ 
def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False): else: pitem = { - "name": pelem["name"], + "name": reverse_label_mapping[pelem["name"]], "type": pelem["type"], "prov": [ {"bbox": pelem["bbox"], "page": pelem["page"], "span": [0, 0]} diff --git a/poetry.lock b/poetry.lock index 77975ffb..ca307e25 100644 --- a/poetry.lock +++ b/poetry.lock @@ -676,14 +676,13 @@ jsonref = "^1.1.0" jsonschema = "^4.16.0" pandas = "^2.2.2" pydantic = "^2.6.0" -pydantic-extra-types = "^2.9.0" tabulate = "^0.9.0" [package.source] type = "git" url = "ssh://git@github.com/DS4SD/docling-core.git" -reference = "b50d53c05bf755ddb73c7d33ececdb542877662a" -resolved_reference = "b50d53c05bf755ddb73c7d33ececdb542877662a" +reference = "ce0b7ee64750944e530d03a1cf22a75636fa2775" +resolved_reference = "ce0b7ee64750944e530d03a1cf22a75636fa2775" [[package]] name = "docutils" @@ -2132,28 +2131,6 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" -[[package]] -name = "pydantic-extra-types" -version = "2.9.0" -description = "Extra Pydantic types." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "pydantic_extra_types-2.9.0-py3-none-any.whl", hash = "sha256:f0bb975508572ba7bf3390b7337807588463b7248587e69f43b1ad7c797530d0"}, - {file = "pydantic_extra_types-2.9.0.tar.gz", hash = "sha256:e061c01636188743bb69f368dcd391f327b8cfbfede2fe1cbb1211b06601ba3b"}, -] - -[package.dependencies] -pydantic = ">=2.5.2" - -[package.extras] -all = ["pendulum (>=3.0.0,<4.0.0)", "phonenumbers (>=8,<9)", "pycountry (>=23)", "python-ulid (>=1,<2)", "python-ulid (>=1,<3)", "pytz (>=2024.1)", "semver (>=3.0.2)", "tzdata (>=2024.1)"] -pendulum = ["pendulum (>=3.0.0,<4.0.0)"] -phonenumbers = ["phonenumbers (>=8,<9)"] -pycountry = ["pycountry (>=23)"] -python-ulid = ["python-ulid (>=1,<2)", "python-ulid (>=1,<3)"] -semver = ["semver (>=3.0.2)"] - [[package]] name = "pydantic-settings" version = "2.5.2" @@ -3026,4 +3003,4 @@ toolkit = ["deepsearch-toolkit"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c02e5606fbdde350f9651f5bcf0f2372b6b2a0e7f483606bfd76a80ecdbb9c09" +content-hash = "3a6eb7c7cd371082ae860c804eb27ec09fa03d7cca64893f9ae88a2468fd8ef1" diff --git a/pyproject.toml b/pyproject.toml index e5eeb473..099baa82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ build = "build.py" [tool.poetry.dependencies] python = "^3.9" -docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", rev = "b50d53c05bf755ddb73c7d33ececdb542877662a"} +docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", rev = "ce0b7ee64750944e530d03a1cf22a75636fa2775"} deepsearch-toolkit = { version = ">=0.31.0", optional = true } tabulate = ">=0.8.9" numpy = [