Skip to content

Commit 34417ae

Browse files
committed
feat: add pdf parser
1 parent 2ee14a8 commit 34417ae

13 files changed: +2288 / -172 lines changed

.env.template

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,4 +48,7 @@ TASK_TIMEOUT=3600
4848
# 调试模式
4949
DEBUG=false
5050
# 日志级别
51-
LOG_LEVEL=INFO
51+
LOG_LEVEL=INFO
52+
53+
# PDF解析
54+
MINERU_MODEL_SOURCE=local

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -174,4 +174,5 @@ cython_debug/
174174
# PyPI configuration file
175175
.pypirc
176176

177-
examples/
177+
examples/
178+
models/

mineru.json

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,37 @@
1+
{
2+
"bucket_info": {
3+
"bucket-name-1": [
4+
"ak",
5+
"sk",
6+
"endpoint"
7+
],
8+
"bucket-name-2": [
9+
"ak",
10+
"sk",
11+
"endpoint"
12+
]
13+
},
14+
"latex-delimiter-config": {
15+
"display": {
16+
"left": "$$",
17+
"right": "$$"
18+
},
19+
"inline": {
20+
"left": "$",
21+
"right": "$"
22+
}
23+
},
24+
"llm-aided-config": {
25+
"title_aided": {
26+
"api_key": "your_api_key",
27+
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
28+
"model": "qwen2.5-32b-instruct",
29+
"enable": false
30+
}
31+
},
32+
"models-dir": {
33+
"pipeline": "models",
34+
"vlm": ""
35+
},
36+
"config_version": "1.3.0"
37+
}

parsers/base_models.py

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
from abc import ABC, abstractmethod
33
from enum import Enum
44
from typing import Any
5-
5+
from pathlib import Path
66
from pydantic import BaseModel, Field
77

88
logger = logging.getLogger(__name__)
@@ -19,15 +19,34 @@ class TableDataItem(BaseModel):
1919
"""表格数据类"""
2020
rows: int # 行数
2121
columns: int # 列数
22+
grid: list[list[str]] = Field(default_factory=list) # 网格数据
2223
row_headers: list[Any] = Field(default_factory=list) # 行头
2324
column_headers: list[Any] = Field(default_factory=list) # 列头
2425
data: list[list[str]] = Field(default_factory=list) # 数据
26+
caption: list[str] = Field(default_factory=list) # 表格标题
27+
footnote: list[str] = Field(default_factory=list) # 表格注脚
28+
29+
class TextDataItem(BaseModel):
30+
"""文本数据类"""
31+
text: str # 文本
32+
text_level: int|None = None # 文本级别
33+
34+
class ImageDataItem(BaseModel):
35+
"""图片数据类"""
36+
uri: str|None = None # 图片 URI
37+
caption: list[str] = Field(default_factory=list) # 图片标题
38+
footnote: list[str] = Field(default_factory=list) # 图片注脚
39+
40+
class FormulaDataItem(BaseModel):
41+
"""公式数据类"""
42+
text: str # 公式
43+
text_format: str|None = None # 公式格式
2544

2645
class ChunkData(BaseModel):
2746
"""块数据类"""
2847
type: ChunkType
2948
name: str|None = None
30-
content: str|TableDataItem|None = None
49+
content: TableDataItem|TextDataItem|ImageDataItem|FormulaDataItem
3150
description: str|None = None
3251

3352
class DocumentData(BaseModel):
@@ -45,6 +64,6 @@ class DocumentParser(ABC):
4564
"""文档解析器基类"""
4665

4766
@abstractmethod
48-
async def parse(self, file_path: str) -> DocumentData:
67+
async def parse(self, file_path: Path) -> DocumentData:
4968
"""解析文档"""
5069
pass

parsers/docx_parser.py

Lines changed: 29 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
import asyncio
99
import logging
1010
import time
11-
11+
from pathlib import Path
1212
from docling.datamodel.base_models import InputFormat
1313
from docling.document_converter import DocumentConverter, WordFormatOption
1414
from docling.pipeline.simple_pipeline import SimplePipeline
@@ -31,6 +31,9 @@
3131
DocumentData,
3232
DocumentParser,
3333
TableDataItem,
34+
ImageDataItem,
35+
TextDataItem,
36+
FormulaDataItem
3437
)
3538
from parsers.parser_registry import register_parser
3639

@@ -54,7 +57,7 @@ def __init__(self) -> None:
5457
)
5558
logger.debug("DocxDocumentParser initialized with SimplePipeline")
5659

57-
async def parse(self, file_path: str) -> DocumentData:
60+
async def parse(self, file_path: Path) -> DocumentData:
5861
"""异步解析DOCX文件
5962
6063
Args:
@@ -70,16 +73,6 @@ async def parse(self, file_path: str) -> DocumentData:
7073
result = await loop.run_in_executor(None, self._converter.convert, file_path)
7174
doc_data = result.document
7275

73-
# 确保文档数据包含所有必要的属性
74-
if not hasattr(doc_data, 'name'):
75-
doc_data.name = 'Unknown Document'
76-
if not hasattr(doc_data, 'texts'):
77-
doc_data.texts = []
78-
if not hasattr(doc_data, 'pictures'):
79-
doc_data.pictures = []
80-
if not hasattr(doc_data, 'tables'):
81-
doc_data.tables = []
82-
8376
title = self._extract_title(doc_data)
8477
images = self._extract_images(doc_data.pictures)
8578
tables = self._extract_tables(doc_data.tables)
@@ -117,20 +110,18 @@ def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
117110
"""
118111
image_items = []
119112
for idx, picture in enumerate(pictures):
120-
image_uri = ""
121-
if hasattr(picture, 'image') and picture.image and hasattr(picture.image, 'uri'):
122-
image_uri = str(picture.image.uri)
123-
124-
caption = ""
125-
if hasattr(picture, 'captions') and picture.captions:
126-
caption = str(picture.captions[0])
127-
113+
image_uri = str(picture.image.uri)
114+
caption = [caption.cref for caption in picture.captions]
115+
footnote = [footnote.cref for footnote in picture.footnotes]
128116
image_items.append(
129117
ChunkData(
130118
type=ChunkType.IMAGE,
131-
name=getattr(picture, 'self_ref', None) or f"#/pictures/{idx}",
132-
content=image_uri,
133-
description=caption
119+
name=f"#/pictures/{idx}",
120+
content=ImageDataItem(
121+
uri=image_uri,
122+
caption=caption,
123+
footnote=footnote
124+
)
134125
)
135126
)
136127

@@ -145,32 +136,22 @@ def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
145136
Returns:
146137
List[ChunkData]: 表格列表
147138
"""
148-
# 添加安全检查,确保 tables 参数存在且可迭代
149-
if not tables or not hasattr(tables, '__iter__'):
150-
return []
151-
152139
table_items: list[ChunkData] = []
153140
for table in tables:
154-
if not hasattr(table, 'data') or not hasattr(table.data, 'grid'):
155-
continue
156-
if len(table.data.grid) == 0:
157-
continue
158-
159-
table_cells = table.data.grid
160-
row_headers = [cell.text for cell in table_cells[0] if cell.row_header]
161-
column_headers = [cell.text for cell in table_cells[0] if cell.column_header]
162-
data = [[cell.text for cell in row] for row in table_cells[1:]]
141+
caption = [caption.cref for caption in table.captions]
142+
footnote = [footnote.cref for footnote in table.footnotes]
143+
grid = [[cell.text if cell.text else '' for cell in row] for row in table.data.grid]
163144
table_data = TableDataItem(
164145
rows=table.data.num_rows,
165146
columns=table.data.num_cols,
166-
row_headers=row_headers,
167-
column_headers=column_headers,
168-
data=data
147+
grid=grid,
148+
caption=caption,
149+
footnote=footnote
169150
)
170151
table_items.append(
171152
ChunkData(
172153
type=ChunkType.TABLE,
173-
name=getattr(table, 'self_ref', None) or f"table-{len(table_items)}",
154+
name=f"#/tables/{len(table_items)}",
174155
content=table_data
175156
)
176157
)
@@ -212,16 +193,20 @@ def _extract_texts(self, texts:list[TitleItem|SectionHeaderItem|ListItem|CodeIte
212193
text_items.append(
213194
ChunkData(
214195
type=ChunkType.FORMULA,
215-
name=item.self_ref or f"formula-{len(text_items)}",
216-
content=item.text
196+
name=f"formula-{len(text_items)}",
197+
content=FormulaDataItem(
198+
text=item.text
199+
)
217200
)
218201
)
219202
case _:
220203
text_items.append(
221204
ChunkData(
222205
type=ChunkType.TEXT,
223-
name=f"text-{len(text_items)}",
224-
content=item.text
206+
name=f"#/texts/{len(text_items)}",
207+
content=TextDataItem(
208+
text=item.text
209+
)
225210
)
226211
)
227212
return text_items

parsers/excel_parser.py

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@
2525
DocumentData,
2626
DocumentParser,
2727
TableDataItem,
28+
TextDataItem,
29+
ImageDataItem
2830
)
2931
from parsers.parser_registry import register_parser
3032

@@ -64,7 +66,7 @@ def __init__(self, config: ExcelParseConfig | None = None):
6466
self.config: ExcelParseConfig = config or ExcelParseConfig()
6567
self.image_index: int = 0
6668

67-
async def parse(self, file_path: str) -> DocumentData:
69+
async def parse(self, file_path: Path) -> DocumentData:
6870
"""
6971
将Excel文件转换为JSON格式
7072
Args:
@@ -92,8 +94,9 @@ async def parse(self, file_path: str) -> DocumentData:
9294
texts.append(ChunkData(
9395
type=ChunkType.TEXT,
9496
name=sheet_name,
95-
content=f"工作表 {sheet_index + 1}: {sheet_name}",
96-
description="工作表标题"
97+
content=TextDataItem(
98+
text=f"工作表 {sheet_index + 1}: {sheet_name}",
99+
),
97100
))
98101

99102
# 处理图片
@@ -104,9 +107,8 @@ async def parse(self, file_path: str) -> DocumentData:
104107
table_content = self._extract_table_data(sheet)
105108
tables.append(ChunkData(
106109
type=ChunkType.TABLE,
107-
name="表格",
108-
content=table_content,
109-
description="表格"
110+
name=f"#/tables/{sheet_index}",
111+
content=table_content
110112
))
111113
processing_time = time.time() - start_time
112114
return DocumentData(
@@ -125,7 +127,7 @@ async def parse(self, file_path: str) -> DocumentData:
125127
processing_time=processing_time
126128
)
127129

128-
def _load_workbook(self, excel_path: str) -> Workbook:
130+
def _load_workbook(self, excel_path: Path) -> Workbook:
129131
"""
130132
加载Excel工作簿
131133
Args:
@@ -191,8 +193,9 @@ def _process_image_object(self, img_obj: Image) -> ChunkData | None:
191193
image_info = ChunkData(
192194
type=ChunkType.IMAGE,
193195
name=f"#/pictures/{self.image_index}",
194-
content=uri,
195-
description=self.config.image_description_placeholder
196+
content=ImageDataItem(
197+
uri=uri
198+
)
196199
)
197200

198201
self.image_index += 1
@@ -263,8 +266,7 @@ def _extract_table_data(self, sheet: Worksheet) -> TableDataItem:
263266
return TableDataItem(
264267
rows=len(all_rows),
265268
columns=max_col,
266-
row_headers=all_rows[0] if all_rows else [],
267-
data=all_rows[1:] if len(all_rows) > 1 else []
269+
grid=all_rows
268270
)
269271

270272
def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], str]:

0 commit comments

Comments
 (0)