Skip to content

Commit

Permalink
feat: Code and equation model for PDF and code blocks in markdown (#752)
Browse files Browse the repository at this point in the history
* propagated changes for new CodeItem class

Signed-off-by: Matteo Omenetti <[email protected]>

* Rebased branch on latest main. Changes for CodeItem.

Signed-off-by: Matteo Omenetti <[email protected]>

* removed unused files

Signed-off-by: Matteo Omenetti <[email protected]>

* chore: update lockfile

Signed-off-by: Christoph Auer <[email protected]>

* pin latest docling-core

Signed-off-by: Michele Dolfi <[email protected]>

* update docling-core pinning

Signed-off-by: Michele Dolfi <[email protected]>

* pin docling-core

Signed-off-by: Michele Dolfi <[email protected]>

* use new add_code in backends and update typing in MD backend

Signed-off-by: Michele Dolfi <[email protected]>

* added if statement for backend

Signed-off-by: Matteo Omenetti <[email protected]>

* removed unused import

Signed-off-by: Matteo Omenetti <[email protected]>

* removed print statements

Signed-off-by: Matteo Omenetti <[email protected]>

* ground truth (gt) for new pdf

Signed-off-by: Matteo Omenetti <[email protected]>

* Update docling/pipeline/standard_pdf_pipeline.py

Co-authored-by: Michele Dolfi <[email protected]>
Signed-off-by: Matteo <[email protected]>

* fixed doc comment of __call__ function of code_formula_model

Signed-off-by: Matteo Omenetti <[email protected]>

* fix artifacts_path type

Signed-off-by: Michele Dolfi <[email protected]>

* move imports

Signed-off-by: Michele Dolfi <[email protected]>

* move expansion_factor to base class

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Matteo Omenetti <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Matteo <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
  • Loading branch information
4 people authored Jan 24, 2025
1 parent c58f75d commit 3213b24
Show file tree
Hide file tree
Showing 28 changed files with 707 additions and 318 deletions.
1 change: 0 additions & 1 deletion docling/backend/asciidoc_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@


class AsciiDocBackend(DeclarativeDocumentBackend):

def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

Expand Down
2 changes: 1 addition & 1 deletion docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def handle_code(self, element, idx, doc):
label = DocItemLabel.CODE
if len(text) == 0:
return
doc.add_text(parent=self.parents[self.level], label=label, text=text)
doc.add_code(parent=self.parents[self.level], label=label, text=text)

def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
Expand Down
71 changes: 44 additions & 27 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,22 @@
import warnings
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import List, Optional, Set, Union

import marko
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from marko import Markdown

Expand All @@ -27,8 +30,7 @@


class MarkdownDocumentBackend(DeclarativeDocumentBackend):

def shorten_underscore_sequences(self, markdown_text, max_length=10):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"

Expand Down Expand Up @@ -90,13 +92,13 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) from e
return

def close_table(self, doc=None):
def close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
_log.debug(md_table_row)
_log.debug("=== TABLE END ===")
tcells = []
tcells: List[TableCell] = []
result_table = []
for n, md_table_row in enumerate(self.md_table_buffer):
data = []
Expand Down Expand Up @@ -137,15 +139,19 @@ def close_table(self, doc=None):
self.in_table = False
self.md_table_buffer = [] # clean table markdown buffer
# Initialize Docling TableData
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
table_data = TableData(
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
)
# Populate
for tcell in tcells:
data.table_cells.append(tcell)
table_data.table_cells.append(tcell)
if len(tcells) > 0:
doc.add_table(data=data)
doc.add_table(data=table_data)
return

def process_inline_text(self, parent_element, doc=None):
def process_inline_text(
self, parent_element: Optional[NodeItem], doc: DoclingDocument
):
# self.inline_text_buffer += str(text_in)
txt = self.inline_text_buffer.strip()
if len(txt) > 0:
Expand All @@ -156,14 +162,20 @@ def process_inline_text(self, parent_element, doc=None):
)
self.inline_text_buffer = ""

def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
def iterate_elements(
self,
element: marko.block.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}"
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
Expand All @@ -172,10 +184,10 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):

# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings = []
strings: List[str] = []

# Define a recursive function to traverse the tree
def traverse(node):
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
Expand Down Expand Up @@ -209,9 +221,13 @@ def traverse(node):
self.process_inline_text(parent_element, doc)
_log.debug(" - List item")

snippet_text = str(element.children[0].children[0].children)
snippet_text = str(element.children[0].children[0].children) # type: ignore
is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST:
if (
parent_element is not None
and isinstance(parent_element, DocItem)
and parent_element.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
Expand All @@ -221,7 +237,14 @@ def traverse(node):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)

fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
)

doc.add_picture(parent=parent_element, caption=fig_caption)

elif isinstance(element, marko.block.Paragraph):
self.process_inline_text(parent_element, doc)
Expand Down Expand Up @@ -252,27 +275,21 @@ def traverse(node):
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.inline.LineBreak):
self.process_inline_text(parent_element, doc)
Expand Down
1 change: 0 additions & 1 deletion docling/backend/msexcel_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ class ExcelTable(BaseModel):


class MsExcelDocumentBackend(DeclarativeDocumentBackend):

def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

Expand Down
1 change: 0 additions & 1 deletion docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@


class MsWordDocumentBackend(DeclarativeDocumentBackend):

def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.XML_KEY = (
Expand Down
2 changes: 0 additions & 2 deletions docling/backend/pdf_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@


class PdfPageBackend(ABC):

@abstractmethod
def get_text_in_rect(self, bbox: BoundingBox) -> str:
pass
Expand Down Expand Up @@ -45,7 +44,6 @@ def unload(self):


class PdfDocumentBackend(PaginatedDocumentBackend):

def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

Expand Down
14 changes: 5 additions & 9 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
import logging
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
from typing import Any, List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from pydantic_settings import (
BaseSettings,
PydanticBaseSettingsSource,
SettingsConfigDict,
)
from typing_extensions import deprecated
from pydantic import BaseModel, ConfigDict, Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -225,6 +219,8 @@ class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code

table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
Expand Down
19 changes: 17 additions & 2 deletions docling/models/base_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional

from docling_core.types.doc import DoclingDocument, NodeItem, TextItem
from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
from typing_extensions import TypeVar

from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
Expand Down Expand Up @@ -53,6 +53,7 @@ class BaseItemAndImageEnrichmentModel(
):

images_scale: float
expansion_factor: float = 0.0

def prepare_element(
self, conv_res: ConversionResult, element: NodeItem
Expand All @@ -62,8 +63,22 @@ def prepare_element(

assert isinstance(element, TextItem)
element_prov = element.prov[0]

bbox = element_prov.bbox
width = bbox.r - bbox.l
height = bbox.t - bbox.b

# TODO: move to a utility in the BoundingBox class
expanded_bbox = BoundingBox(
l=bbox.l - width * self.expansion_factor,
t=bbox.t + height * self.expansion_factor,
r=bbox.r + width * self.expansion_factor,
b=bbox.b - height * self.expansion_factor,
coord_origin=bbox.coord_origin,
)

page_ix = element_prov.page_no - 1
cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=element_prov.bbox
scale=self.images_scale, cropbox=expanded_bbox
)
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
Loading

0 comments on commit 3213b24

Please sign in to comment.