8
8
import asyncio
9
9
import logging
10
10
import time
11
-
11
+ from pathlib import Path
12
12
from docling .datamodel .base_models import InputFormat
13
13
from docling .document_converter import DocumentConverter , WordFormatOption
14
14
from docling .pipeline .simple_pipeline import SimplePipeline
31
31
DocumentData ,
32
32
DocumentParser ,
33
33
TableDataItem ,
34
+ ImageDataItem ,
35
+ TextDataItem ,
36
+ FormulaDataItem
34
37
)
35
38
from parsers .parser_registry import register_parser
36
39
@@ -54,7 +57,7 @@ def __init__(self) -> None:
54
57
)
55
58
logger .debug ("DocxDocumentParser initialized with SimplePipeline" )
56
59
57
- async def parse (self , file_path : str ) -> DocumentData :
60
+ async def parse (self , file_path : Path ) -> DocumentData :
58
61
"""异步解析DOCX文件
59
62
60
63
Args:
@@ -70,16 +73,6 @@ async def parse(self, file_path: str) -> DocumentData:
70
73
result = await loop .run_in_executor (None , self ._converter .convert , file_path )
71
74
doc_data = result .document
72
75
73
- # 确保文档数据包含所有必要的属性
74
- if not hasattr (doc_data , 'name' ):
75
- doc_data .name = 'Unknown Document'
76
- if not hasattr (doc_data , 'texts' ):
77
- doc_data .texts = []
78
- if not hasattr (doc_data , 'pictures' ):
79
- doc_data .pictures = []
80
- if not hasattr (doc_data , 'tables' ):
81
- doc_data .tables = []
82
-
83
76
title = self ._extract_title (doc_data )
84
77
images = self ._extract_images (doc_data .pictures )
85
78
tables = self ._extract_tables (doc_data .tables )
@@ -117,20 +110,18 @@ def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
117
110
"""
118
111
image_items = []
119
112
for idx , picture in enumerate (pictures ):
120
- image_uri = ""
121
- if hasattr (picture , 'image' ) and picture .image and hasattr (picture .image , 'uri' ):
122
- image_uri = str (picture .image .uri )
123
-
124
- caption = ""
125
- if hasattr (picture , 'captions' ) and picture .captions :
126
- caption = str (picture .captions [0 ])
127
-
113
+ image_uri = str (picture .image .uri )
114
+ caption = [caption .cref for caption in picture .captions ]
115
+ footnote = [footnote .cref for footnote in picture .footnotes ]
128
116
image_items .append (
129
117
ChunkData (
130
118
type = ChunkType .IMAGE ,
131
- name = getattr (picture , 'self_ref' , None ) or f"#/pictures/{ idx } " ,
132
- content = image_uri ,
133
- description = caption
119
+ name = f"#/pictures/{ idx } " ,
120
+ content = ImageDataItem (
121
+ uri = image_uri ,
122
+ caption = caption ,
123
+ footnote = footnote
124
+ )
134
125
)
135
126
)
136
127
@@ -145,32 +136,22 @@ def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
145
136
Returns:
146
137
List[ChunkData]: 表格列表
147
138
"""
148
- # 添加安全检查,确保 tables 参数存在且可迭代
149
- if not tables or not hasattr (tables , '__iter__' ):
150
- return []
151
-
152
139
table_items : list [ChunkData ] = []
153
140
for table in tables :
154
- if not hasattr (table , 'data' ) or not hasattr (table .data , 'grid' ):
155
- continue
156
- if len (table .data .grid ) == 0 :
157
- continue
158
-
159
- table_cells = table .data .grid
160
- row_headers = [cell .text for cell in table_cells [0 ] if cell .row_header ]
161
- column_headers = [cell .text for cell in table_cells [0 ] if cell .column_header ]
162
- data = [[cell .text for cell in row ] for row in table_cells [1 :]]
141
+ caption = [caption .cref for caption in table .captions ]
142
+ footnote = [footnote .cref for footnote in table .footnotes ]
143
+ grid = [[cell .text if cell .text else '' for cell in row ] for row in table .data .grid ]
163
144
table_data = TableDataItem (
164
145
rows = table .data .num_rows ,
165
146
columns = table .data .num_cols ,
166
- row_headers = row_headers ,
167
- column_headers = column_headers ,
168
- data = data
147
+ grid = grid ,
148
+ caption = caption ,
149
+ footnote = footnote
169
150
)
170
151
table_items .append (
171
152
ChunkData (
172
153
type = ChunkType .TABLE ,
173
- name = getattr ( table , 'self_ref' , None ) or f"table- { len (table_items )} " ,
154
+ name = f"#/tables/ { len (table_items )} " ,
174
155
content = table_data
175
156
)
176
157
)
@@ -212,16 +193,20 @@ def _extract_texts(self, texts:list[TitleItem|SectionHeaderItem|ListItem|CodeIte
212
193
text_items .append (
213
194
ChunkData (
214
195
type = ChunkType .FORMULA ,
215
- name = item .self_ref or f"formula-{ len (text_items )} " ,
216
- content = item .text
196
+ name = f"formula-{ len (text_items )} " ,
197
+ content = FormulaDataItem (
198
+ text = item .text
199
+ )
217
200
)
218
201
)
219
202
case _:
220
203
text_items .append (
221
204
ChunkData (
222
205
type = ChunkType .TEXT ,
223
- name = f"text-{ len (text_items )} " ,
224
- content = item .text
206
+ name = f"#/texts/{ len (text_items )} " ,
207
+ content = TextDataItem (
208
+ text = item .text
209
+ )
225
210
)
226
211
)
227
212
return text_items
0 commit comments