feat: Code and equation model for PDF and code blocks in markdown (#752)

* propagated changes for new CodeItem class Signed-off-by: Matteo Omenetti <[email protected]>
* Rebased branch on latest main. changes for CodeItem Signed-off-by: Matteo Omenetti <[email protected]>
* removed unused files Signed-off-by: Matteo Omenetti <[email protected]>
* chore: update lockfile Signed-off-by: Christoph Auer <[email protected]>
* pin latest docling-core Signed-off-by: Michele Dolfi <[email protected]>
* update docling-core pinning Signed-off-by: Michele Dolfi <[email protected]>
* pin docling-core Signed-off-by: Michele Dolfi <[email protected]>
* use new add_code in backends and update typing in MD backend Signed-off-by: Michele Dolfi <[email protected]>
* added if statement for backend Signed-off-by: Matteo Omenetti <[email protected]>
* removed unused import Signed-off-by: Matteo Omenetti <[email protected]>
* removed print statements Signed-off-by: Matteo Omenetti <[email protected]>
* gt for new pdf Signed-off-by: Matteo Omenetti <[email protected]>
* Update docling/pipeline/standard_pdf_pipeline.py Co-authored-by: Michele Dolfi <[email protected]> Signed-off-by: Matteo <[email protected]>
* fixed doc comment of __call__ function of code_formula_model Signed-off-by: Matteo Omenetti <[email protected]>
* fix artifacts_path type Signed-off-by: Michele Dolfi <[email protected]>
* move imports Signed-off-by: Michele Dolfi <[email protected]>
* move expansion_factor to base class Signed-off-by: Michele Dolfi <[email protected]>
---------
Signed-off-by: Matteo Omenetti <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Matteo <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
DS4SD · Jan 24, 2025 · 3213b24 · 3213b24
1 parent c58f75d
commit 3213b24
Show file tree

Hide file tree

Showing 28 changed files with 707 additions and 318 deletions.
diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
@@ -24,7 +24,6 @@
 
 
 class AsciiDocBackend(DeclarativeDocumentBackend):
-
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
@@ -215,7 +215,7 @@ def handle_code(self, element, idx, doc):
         label = DocItemLabel.CODE
         if len(text) == 0:
             return
-        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+        doc.add_code(parent=self.parents[self.level], label=label, text=text)
 
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""

diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
@@ -3,19 +3,22 @@
 import warnings
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import List, Optional, Set, Union
 
 import marko
 import marko.ext
 import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
+    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    NodeItem,
     TableCell,
     TableData,
+    TextItem,
 )
 from marko import Markdown
 
@@ -27,8 +30,7 @@
 
 
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-
-    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
 
@@ -90,13 +92,13 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
             ) from e
         return
 
-    def close_table(self, doc=None):
+    def close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells = []
+            tcells: List[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -137,15 +139,19 @@ def close_table(self, doc=None):
             self.in_table = False
             self.md_table_buffer = []  # clean table markdown buffer
             # Initialize Docling TableData
-            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            table_data = TableData(
+                num_rows=num_rows, num_cols=num_cols, table_cells=tcells
+            )
             # Populate
             for tcell in tcells:
-                data.table_cells.append(tcell)
+                table_data.table_cells.append(tcell)
             if len(tcells) > 0:
-                doc.add_table(data=data)
+                doc.add_table(data=table_data)
         return
 
-    def process_inline_text(self, parent_element, doc=None):
+    def process_inline_text(
+        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    ):
         # self.inline_text_buffer += str(text_in)
         txt = self.inline_text_buffer.strip()
         if len(txt) > 0:
@@ -156,14 +162,20 @@ def process_inline_text(self, parent_element, doc=None):
             )
         self.inline_text_buffer = ""
 
-    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+    def iterate_elements(
+        self,
+        element: marko.block.Element,
+        depth: int,
+        doc: DoclingDocument,
+        parent_element: Optional[NodeItem] = None,
+    ):
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(
-                f" - Heading level {element.level}, content: {element.children[0].children}"
+                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
             if element.level == 1:
                 doc_label = DocItemLabel.TITLE
@@ -172,10 +184,10 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
 
             # Header could have arbitrary inclusion of bold, italic or emphasis,
             # hence we need to traverse the tree to get full text of a header
-            strings = []
+            strings: List[str] = []
 
             # Define a recursive function to traverse the tree
-            def traverse(node):
+            def traverse(node: marko.block.BlockElement):
                 # Check if the node has a "children" attribute
                 if hasattr(node, "children"):
                     # If "children" is a list, continue traversal
@@ -209,9 +221,13 @@ def traverse(node):
             self.process_inline_text(parent_element, doc)
             _log.debug(" - List item")
 
-            snippet_text = str(element.children[0].children[0].children)
+            snippet_text = str(element.children[0].children[0].children)  # type: ignore
             is_numbered = False
-            if parent_element.label == GroupLabel.ORDERED_LIST:
+            if (
+                parent_element is not None
+                and isinstance(parent_element, DocItem)
+                and parent_element.label == GroupLabel.ORDERED_LIST
+            ):
                 is_numbered = True
             doc.add_list_item(
                 enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +237,14 @@ def traverse(node):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
-            doc.add_picture(parent=parent_element, caption=element.title)
+
+            fig_caption: Optional[TextItem] = None
+            if element.title is not None and element.title != "":
+                fig_caption = doc.add_text(
+                    label=DocItemLabel.CAPTION, text=element.title
+                )
+
+            doc.add_picture(parent=parent_element, caption=fig_caption)
 
         elif isinstance(element, marko.block.Paragraph):
             self.process_inline_text(parent_element, doc)
@@ -252,27 +275,21 @@ def traverse(node):
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.block.CodeBlock):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.block.FencedCode):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.inline.LineBreak):
             self.process_inline_text(parent_element, doc)

diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py
@@ -44,7 +44,6 @@ class ExcelTable(BaseModel):
 
 
 class MsExcelDocumentBackend(DeclarativeDocumentBackend):
-
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 

diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
@@ -26,7 +26,6 @@
 
 
 class MsWordDocumentBackend(DeclarativeDocumentBackend):
-
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
         self.XML_KEY = (

diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py
@@ -12,7 +12,6 @@
 
 
 class PdfPageBackend(ABC):
-
     @abstractmethod
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         pass
@@ -45,7 +44,6 @@ def unload(self):
 
 
 class PdfDocumentBackend(PaginatedDocumentBackend):
-
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -1,17 +1,11 @@
 import logging
 import os
-import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
+from typing import Any, List, Literal, Optional, Union
 
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic_settings import (
-    BaseSettings,
-    PydanticBaseSettingsSource,
-    SettingsConfigDict,
-)
-from typing_extensions import deprecated
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 _log = logging.getLogger(__name__)
 
@@ -225,6 +219,8 @@ class PdfPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+    do_code_enrichment: bool = False  # True: perform code OCR
+    do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[

diff --git a/docling/models/base_model.py b/docling/models/base_model.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Generic, Iterable, Optional
 
-from docling_core.types.doc import DoclingDocument, NodeItem, TextItem
+from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
 from typing_extensions import TypeVar
 
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
@@ -53,6 +53,7 @@ class BaseItemAndImageEnrichmentModel(
 ):
 
     images_scale: float
+    expansion_factor: float = 0.0
 
     def prepare_element(
         self, conv_res: ConversionResult, element: NodeItem
@@ -62,8 +63,22 @@ def prepare_element(
 
         assert isinstance(element, TextItem)
         element_prov = element.prov[0]
+
+        bbox = element_prov.bbox
+        width = bbox.r - bbox.l
+        height = bbox.t - bbox.b
+
+        # TODO: move to a utility in the BoundingBox class
+        expanded_bbox = BoundingBox(
+            l=bbox.l - width * self.expansion_factor,
+            t=bbox.t + height * self.expansion_factor,
+            r=bbox.r + width * self.expansion_factor,
+            b=bbox.b - height * self.expansion_factor,
+            coord_origin=bbox.coord_origin,
+        )
+
         page_ix = element_prov.page_no - 1
         cropped_image = conv_res.pages[page_ix].get_image(
-            scale=self.images_scale, cropbox=element_prov.bbox
+            scale=self.images_scale, cropbox=expanded_bbox
         )
         return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
Original file line number	Diff line number	Diff line change
@@ -24,7 +24,6 @@
 
 
 class AsciiDocBackend(DeclarativeDocumentBackend):
-
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
Expand Down
Original file line number	Diff line number	Diff line change
@@ -44,7 +44,6 @@ class ExcelTable(BaseModel):
 
 
 class MsExcelDocumentBackend(DeclarativeDocumentBackend):
-
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
Expand Down