Skip to content

Commit

Permalink
Reverse label mapping for legacy_format, fixes
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 25, 2024
1 parent b7b35a7 commit abab9ee
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 34 deletions.
35 changes: 28 additions & 7 deletions deepsearch_glm/utils/doc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:

def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
doc: DoclingDocument = DoclingDocument(description={},
file_info=FileInfo(document_hash=doc_glm["file-info"]["document-hash"]))
file_info=FileInfo(filename=doc_glm["file-info"]["filename"], document_hash=doc_glm["file-info"]["document-hash"]))

if "properties" in doc_glm:
props = pd.DataFrame(
Expand Down Expand Up @@ -299,6 +299,27 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):
"""Convert Document object (with `body`) to its legacy format (with `main-text`)"""

reverse_label_mapping = {
DocItemLabel.CAPTION.value: "Caption",
DocItemLabel.FOOTNOTE.value: "Footnote",
DocItemLabel.FORMULA.value: "Formula",
DocItemLabel.LIST_ITEM.value: "List-item",
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
DocItemLabel.PAGE_HEADER.value: "Page-header",
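DocItemLabel.PICTURE.value: "Picture", # NOTE(review): the earlier remark here about a "low threshold" to capture chemical structures looks copied from a threshold config; no threshold applies in this label mapping.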
DocItemLabel.SECTION_HEADER.value: "Section-header",
DocItemLabel.TABLE.value: "Table",
DocItemLabel.TEXT.value: "Text",
DocItemLabel.TITLE.value: "Title",
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
DocItemLabel.CODE.value: "Code",
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
DocItemLabel.FORM.value: "Form",
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
DocItemLabel.PARAGRAPH.value: "paragraph"
}

doc_leg["main-text"] = []
doc_leg["figures"] = []
doc_leg["tables"] = []
Expand Down Expand Up @@ -360,7 +381,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):

pitem = {
"text": text,
"name": nelem["name"],
"name": reverse_label_mapping[nelem["name"]],
"type": nelem["type"],
"prov": [
{
Expand Down Expand Up @@ -393,7 +414,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):

pitem = {
"$ref": f"#/figures/{find}",
"name": pelem["name"],
"name": reverse_label_mapping[pelem["name"]],
"type": pelem["type"],
}
doc_leg["main-text"].append(pitem)
Expand All @@ -418,7 +439,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):

pitem = {
"text": text,
"name": nelem["name"],
"name": reverse_label_mapping[nelem["name"]],
"type": nelem["type"],
"prov": [
{
Expand Down Expand Up @@ -449,7 +470,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):

pitem = {
"$ref": f"#/tables/{tind}",
"name": pelem["name"],
"name": reverse_label_mapping[pelem["name"]],
"type": pelem["type"],
}
doc_leg["main-text"].append(pitem)
Expand All @@ -458,7 +479,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):
text = obj["text"][span_i:span_j]

type_label = pelem["type"]
name_label = pelem["name"]
name_label = reverse_label_mapping[pelem["name"]]
if update_name_label and len(props) > 0 and type_label == "paragraph":
prop = props[
(props["type"] == "semantic") & (props["subj_path"] == iref)
Expand All @@ -482,7 +503,7 @@ def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):

else:
pitem = {
"name": pelem["name"],
"name": reverse_label_mapping[pelem["name"]],
"type": pelem["type"],
"prov": [
{"bbox": pelem["bbox"], "page": pelem["page"], "span": [0, 0]}
Expand Down
29 changes: 3 additions & 26 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ build = "build.py"

[tool.poetry.dependencies]
python = "^3.9"
docling-core = {git = "ssh://[email protected]/DS4SD/docling-core.git", rev = "b50d53c05bf755ddb73c7d33ececdb542877662a"}
docling-core = {git = "ssh://[email protected]/DS4SD/docling-core.git", rev = "ce0b7ee64750944e530d03a1cf22a75636fa2775"}
deepsearch-toolkit = { version = ">=0.31.0", optional = true }
tabulate = ">=0.8.9"
numpy = [
Expand Down

0 comments on commit abab9ee

Please sign in to comment.