Commit 6b6f0a7

feat: add word parser
1 parent d5e71b0 commit 6b6f0a7

File tree

5 files changed (+2033, -4 lines)


parsers/base_models.py

Lines changed: 4 additions & 4 deletions
@@ -26,13 +26,13 @@ class TableDataItem(BaseModel):
 class ChunkData(BaseModel):
     """Chunk data class"""
     type: ChunkType
-    name: str
-    content: str|TableDataItem = ""
-    description: str = ""
+    name: str|None = None
+    content: str|TableDataItem|None = None
+    description: str|None = None

 class DocumentData(BaseModel):
     """Parse result class"""
-    title: str = ""
+    title: str|None = None
     texts: list[ChunkData] = Field(default_factory=list)
     tables: list[ChunkData] = Field(default_factory=list)
     images: list[ChunkData] = Field(default_factory=list)

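A quick sketch of what the relaxed defaults permit (not part of the commit; it assumes the ChunkType enum values used elsewhere in this change, e.g. ChunkType.TEXT):

from parsers.base_models import ChunkData, ChunkType

# Only the chunk type is required now; the other fields default to None.
chunk = ChunkData(type=ChunkType.TEXT)
print(chunk.name, chunk.content, chunk.description)  # None None None
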
parsers/docx_parser.py

Lines changed: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
"""
DOCX document parser module.

Provides parsing of DOCX documents with the Docling library and extraction of
structured content. Supports recognition and output of titles, paragraphs,
lists, tables, and images.
"""

import asyncio
import logging
import time

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, WordFormatOption
from docling.pipeline.simple_pipeline import SimplePipeline
from docling_core.types.doc.document import (
    CodeItem,
    DocItemLabel,
    DoclingDocument,
    FormulaItem,
    ListItem,
    PictureItem,
    SectionHeaderItem,
    TableItem,
    TextItem,
    TitleItem,
)

from parsers.base_models import (
    ChunkData,
    ChunkType,
    DocumentData,
    DocumentParser,
    TableDataItem,
)

logger = logging.getLogger(__name__)


class DocxDocumentParser(DocumentParser):
    """DOCX document parser.

    Uses Docling's parsing pipeline to extract structured content from DOCX
    documents. Exposes an async parse interface conforming to the
    DocumentParser abstract base class.
    """

    def __init__(self) -> None:
        """Initialize the parser."""
        super().__init__()
        self.supported_formats = [".docx"]
        self._converter = DocumentConverter(
            format_options={InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline)},
            allowed_formats=[InputFormat.DOCX]
        )
        logger.debug("DocxDocumentParser initialized with SimplePipeline")

    def can_parse(self, file_path: str) -> bool:
        """Check whether this parser can handle the given file.

        Args:
            file_path: Path to the file.

        Returns:
            bool: True if the file format is supported.
        """
        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)

    async def parse(self, file_path: str) -> DocumentData:
        """Parse a DOCX file asynchronously.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            DocumentData: Parse result containing the title, content,
                processing time, and any error information.
        """
        start_time = time.time()
        try:
            # Run the synchronous conversion in an executor so the event loop is not blocked
            loop = asyncio.get_event_loop()
            result = await loop.run_in_executor(None, self._converter.convert, file_path)
            doc_data = result.document

            # Ensure the document data exposes all attributes we rely on
            if not hasattr(doc_data, 'name'):
                doc_data.name = 'Unknown Document'
            if not hasattr(doc_data, 'texts'):
                doc_data.texts = []
            if not hasattr(doc_data, 'pictures'):
                doc_data.pictures = []
            if not hasattr(doc_data, 'tables'):
                doc_data.tables = []

            title = self._extract_title(doc_data)
            images = self._extract_images(doc_data.pictures)
            tables = self._extract_tables(doc_data.tables)
            texts = self._extract_texts(doc_data.texts)

            processing_time = time.time() - start_time
            logger.info(f"Successfully parsed DOCX: {file_path} (took {processing_time:.2f}s)")
            return DocumentData(
                title=title,
                texts=texts,
                tables=tables,
                images=images,
                processing_time=processing_time,
                success=True
            )

        except Exception as e:
            processing_time = time.time() - start_time
            error_msg = f"Failed to parse DOCX file {file_path}: {type(e).__name__}: {e}"
            logger.exception(error_msg)  # log the full stack trace
            return DocumentData(
                success=False,
                error_message=str(e),
                processing_time=processing_time
            )

    def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
        """Extract the images in the document.

        Args:
            pictures: List of picture items.

        Returns:
            List[ChunkData]: Image chunks.
        """
        image_items = []
        for idx, picture in enumerate(pictures):
            image_uri = ""
            if hasattr(picture, 'image') and picture.image and hasattr(picture.image, 'uri'):
                image_uri = str(picture.image.uri)

            caption = ""
            if hasattr(picture, 'captions') and picture.captions:
                caption = str(picture.captions[0])

            image_items.append(
                ChunkData(
                    type=ChunkType.IMAGE,
                    name=getattr(picture, 'self_ref', None) or f"#/pictures/{idx}",
                    content=image_uri,
                    description=caption
                )
            )

        return image_items

    def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
        """Extract the tables in the document.

        Args:
            tables: List of table items.

        Returns:
            List[ChunkData]: Table chunks.
        """
        # Safety check: make sure tables exists and is iterable
        if not tables or not hasattr(tables, '__iter__'):
            return []

        table_items: list[ChunkData] = []
        for table in tables:
            if not hasattr(table, 'data') or not hasattr(table.data, 'grid'):
                continue
            if len(table.data.grid) == 0:
                continue

            table_cells = table.data.grid
            row_headers = [cell.text for cell in table_cells[0] if cell.row_header]
            column_headers = [cell.text for cell in table_cells[0] if cell.column_header]
            data = [[cell.text for cell in row] for row in table_cells[1:]]
            table_data = TableDataItem(
                rows=table.data.num_rows,
                columns=table.data.num_cols,
                row_headers=row_headers,
                column_headers=column_headers,
                data=data
            )
            table_items.append(
                ChunkData(
                    type=ChunkType.TABLE,
                    name=getattr(table, 'self_ref', None) or f"table-{len(table_items)}",
                    content=table_data
                )
            )

        return table_items

    def _extract_title(self, doc_data: DoclingDocument) -> str:
        """Extract the document title.

        Args:
            doc_data: Parsed document data.

        Returns:
            str: The title, falling back to the document name.
        """
        title = ""
        for item in doc_data.texts:
            if hasattr(item, 'label') and item.label == DocItemLabel.TITLE:
                title = item.text
                break
        return title if title else doc_data.name

    def _extract_texts(self, texts: list[TitleItem|SectionHeaderItem|ListItem|CodeItem|FormulaItem|TextItem]) -> list[ChunkData]:
        """Extract the text items in the document.

        Args:
            texts: List of text items.

        Returns:
            List[ChunkData]: Text chunks.
        """
        text_items: list[ChunkData] = []

        for item in texts:
            if not hasattr(item, 'label'):
                continue
            if not hasattr(item, 'text') or len(item.text) == 0:
                continue
            match item.label:
                case DocItemLabel.FORMULA:
                    text_items.append(
                        ChunkData(
                            type=ChunkType.FORMULA,
                            name=item.self_ref or f"formula-{len(text_items)}",
                            content=item.text
                        )
                    )
                case _:
                    text_items.append(
                        ChunkData(
                            type=ChunkType.TEXT,
                            name=f"text-{len(text_items)}",
                            content=item.text
                        )
                    )
        return text_items

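For reviewers who want to try the new parser, a minimal usage sketch (not part of this commit; it assumes the parsers package is importable and that a local sample.docx exists):

import asyncio

from parsers.base_models import DocumentData
from parsers.docx_parser import DocxDocumentParser


async def main() -> None:
    parser = DocxDocumentParser()
    if not parser.can_parse("sample.docx"):  # extension check from can_parse()
        return
    doc: DocumentData = await parser.parse("sample.docx")
    print(doc.title)
    print(f"{len(doc.texts)} texts, {len(doc.tables)} tables, {len(doc.images)} images")


if __name__ == "__main__":
    asyncio.run(main())
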
pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ dependencies = [
     "redis>=6.4.0",
     "openpyxl>=3.1.5",
     "pydantic>=2.11.7",
+    "docling>=2.45.0",
 ]

 [dependency-groups]
