Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
feat: create a docling reader for docs as JSON.
Browse files Browse the repository at this point in the history
New class DoclingJSONReader, a subclass of BaseDoclingReader, that reads Docling documents stored as JSON files.
Adds a unit test for this class.

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam committed Sep 5, 2024
1 parent c236a43 commit 3041376
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 0 deletions.
1 change: 1 addition & 0 deletions quackling/llama_index/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.readers.docling_json_reader import DoclingJSONReader # noqa
from quackling.llama_index.readers.docling_pdf_reader import DoclingPDFReader # noqa
25 changes: 25 additions & 0 deletions quackling/llama_index/readers/docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

import json
from typing import Iterable

from docling_core.types import Document as DLDocument
from llama_index.core.schema import Document as LIDocument

from quackling.llama_index.readers.base import BaseDoclingReader


class DoclingJSONReader(BaseDoclingReader):
    """Reader that loads Docling documents serialized as JSON files."""

    def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:
        """Lazily load Docling JSON files as LlamaIndex documents.

        Args:
            file_path: Path of a single JSON file, or a list of such paths.

        Yields:
            One LlamaIndex document per input file, in input order.

        Raises:
            OSError: If a file cannot be opened.
            json.JSONDecodeError: If a file does not contain valid JSON.
        """
        # Normalize the argument so a single path and a list share one code path.
        sources = file_path if isinstance(file_path, list) else [file_path]

        for source in sources:
            # Keep the file open only while parsing, so that a suspended or
            # abandoned generator never holds an open file handle.
            with open(source, encoding="utf-8") as json_file:
                payload = json.load(json_file)

            # Validate the raw JSON payload against the Docling document model
            # before converting it to a LlamaIndex document.
            dl_doc: DLDocument = DLDocument.model_validate(payload)
            yield self._create_li_doc_from_dl_doc(dl_doc=dl_doc)
14 changes: 14 additions & 0 deletions tests/unit/test_li_docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.readers import DoclingJSONReader


def test_lazy_load_data():
    """Loading a single Docling JSON file must yield exactly one document."""
    json_reader = DoclingJSONReader(parse_type=DoclingJSONReader.ParseType.JSON)

    loaded_docs = [
        doc for doc in json_reader.lazy_load_data("tests/unit/data/0_inp_dl_doc.json")
    ]

    assert len(loaded_docs) == 1

0 comments on commit 3041376

Please sign in to comment.