
Commit 940ef02

fix: refine document data model

1 parent da016a1

File tree: 4 files changed, +113 −123 lines changed

parsers/base_models.py

Lines changed: 15 additions & 2 deletions

@@ -1,29 +1,42 @@
 import logging
 from abc import ABC, abstractmethod
 from enum import Enum
+from typing import Any
 
 from pydantic import BaseModel
 
 logger = logging.getLogger(__name__)
 
+
 class ChunkType(str, Enum):
     """Chunk type"""
     TEXT = "text"
     IMAGE = "image"
     TABLE = "table"
     FORMULA = "formula"
 
+class TableDataItem(BaseModel):
+    """Table data class"""
+    rows: int  # number of rows
+    columns: int  # number of columns
+    row_headers: list[Any] = []  # row headers
+    column_headers: list[Any] = []  # column headers
+    data: list[list[str]] = []  # cell data
+
 class ChunkData(BaseModel):
     """Chunk data class"""
     type: ChunkType
     name: str
-    content: str = ""
+    content: str|TableDataItem = ""
     description: str = ""
 
 class DocumentData(BaseModel):
     """Parse result class"""
     title: str = ""
-    chunks: list[ChunkData] = []
+    texts: list[ChunkData] = []
+    tables: list[ChunkData] = []
+    images: list[ChunkData] = []
+    formulas: list[ChunkData] = []
     processing_time: float = 0
     success: bool
     error_message: str | None = None
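
For orientation, here is a minimal sketch of how the refined model composes. The field names come from the diff above; the sample values are illustrative only:

from parsers.base_models import ChunkData, ChunkType, DocumentData, TableDataItem

# A table chunk now carries structured data instead of a serialized JSON string
table = ChunkData(
    type=ChunkType.TABLE,
    name="sales",  # illustrative name
    content=TableDataItem(
        rows=2,
        columns=2,
        row_headers=["Region", "Revenue"],
        data=[["North", "1200"]],
    ),
    description="quarterly sales",
)

# Chunks are grouped by kind instead of living in one flat `chunks` list
doc = DocumentData(
    title="report",
    texts=[ChunkData(type=ChunkType.TEXT, name="intro", content="Q1 report")],
    tables=[table],
    success=True,
)
print(doc.model_dump(mode="json"))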

parsers/excel_parser.py

Lines changed: 70 additions & 98 deletions

@@ -19,7 +19,13 @@
 from openpyxl.workbook.workbook import Workbook  # type: ignore
 from openpyxl.worksheet.worksheet import Worksheet  # type: ignore
 
-from parsers.base_models import ChunkData, ChunkType, DocumentData, DocumentParser
+from parsers.base_models import (
+    ChunkData,
+    ChunkType,
+    DocumentData,
+    DocumentParser,
+    TableDataItem,
+)
 
 # Ignore specific openpyxl warnings
 warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
@@ -57,105 +63,77 @@ def __init__(self, config: ExcelParseConfig | None = None):
         self.image_index: int = 0
         self.supported_formats: list[str] = ['.xlsx', '.xls']
 
-    async def parse(self, excel_path: str) -> DocumentData:
+    def can_parse(self, file_path: str) -> bool:
+        """
+        Validate the input file
+        Args:
+            file_path: file path
+        Returns:
+            bool: whether parsing is supported
         """
-        Parse the Excel file and save the results
+        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
 
+    async def parse(self, excel_path: str) -> DocumentData:
+        """
+        Convert the Excel file to JSON format
         Args:
             excel_path: path to the Excel file
-            output_dir: output directory path
         Returns:
-            ParseResult: parse result object
-        Raises:
-            ExcelParseError: raised when parsing fails
+            DocumentData: document data
         """
+        # Use the file name (without extension) as the title
         start_time = time.time()
 
         try:
-            # Convert Excel to JSON format
-            title, document_data = self._excel_to_json(excel_path)
-
-            # Compute the processing time
+            # Initialize the content and image lists
+            texts: list[ChunkData] = []
+            tables: list[ChunkData] = []
+            images: list[ChunkData] = []
+
+            # Load the workbook
+            workbook = self._load_workbook(excel_path)
+
+            # Process each worksheet
+            for sheet_index, sheet_name in enumerate(workbook.sheetnames):
+                sheet = workbook[sheet_name]
+
+                # Add the worksheet title
+                texts.append(ChunkData(
+                    type=ChunkType.TEXT,
+                    name=sheet_name,
+                    content=f"工作表 {sheet_index + 1}: {sheet_name}",
+                    description="工作表标题"
+                ))
+
+                # Process images
+                sheet_images = self._extract_sheet_images(sheet)
+                images.extend(sheet_images)
+
+                # Process table data
+                table_content = self._extract_table_data(sheet)
+                tables.append(ChunkData(
+                    type=ChunkType.TABLE,
+                    name="表格",
+                    content=table_content,
+                    description="表格"
+                ))
             processing_time = time.time() - start_time
-
-
             return DocumentData(
-                title=title,
-                chunks=document_data,
+                title=Path(excel_path).stem,
+                texts=texts,
+                tables=tables,
+                images=images,
                 processing_time=processing_time,
                 success=True
             )
-
         except Exception as e:
             processing_time = time.time() - start_time
             return DocumentData(
                 success=False,
-                error_message=str(e)
+                error_message=str(e),
+                processing_time=processing_time
             )
 
-    def can_parse(self, file_path: str) -> bool:
-        """
-        Validate the input file
-        Args:
-            file_path: file path
-        Returns:
-            bool: whether parsing is supported
-        """
-        return any(file_path.lower().endswith(fmt) for fmt in self.supported_formats)
-
-    def _excel_to_json(self, excel_path: str) -> tuple[str, list[ChunkData]]:
-        """
-        Convert the Excel file to JSON format
-        Args:
-            excel_path: path to the Excel file
-        Returns:
-            DocumentData: document data
-        """
-        # Use the file name (without extension) as the title
-        title = Path(excel_path).stem
-
-        # Initialize the content and image lists
-        content: list[ChunkData] = []
-        self.image_index = 0
-
-        # Load the workbook
-        workbook = self._load_workbook(excel_path)
-
-        # Process each worksheet
-        for sheet_index, sheet_name in enumerate(workbook.sheetnames):
-            sheet = workbook[sheet_name]
-
-            # Add the worksheet title
-            content.append(ChunkData(
-                type=ChunkType.TEXT,
-                name=sheet_name,
-                content=f"工作表 {sheet_index + 1}: {sheet_name}",
-                description="工作表标题"
-            ))
-
-            # Process images
-            sheet_images = self._extract_sheet_images(sheet)
-            content.extend(sheet_images)
-
-            # Process table data
-            table_content = self._extract_table_data(sheet)
-            content.append(ChunkData(
-                type=ChunkType.TABLE,
-                name="表格",
-                content=json.dumps(table_content),
-                description="表格"
-            ))
-
-        # Add the closing text
-        content.append(ChunkData(
-            type=ChunkType.TEXT,
-            name="结束文本",
-            content="",
-            description="结束文本"
-        ))
-
-        return title, content
-
     def _load_workbook(self, excel_path: str) -> Workbook:
         """
         Load the Excel workbook
@@ -250,13 +228,13 @@ def _get_image_format(self, img_obj: Image) -> str:
             return img_format
         return self.config.default_image_format
 
-    def _process_cell_value(self, cell_value: Any) -> CellValue:
+    def _process_cell_value(self, cell_value: Any) -> str:
         """
        Preprocess the cell value, converting datetime objects to strings
         Args:
             cell_value: raw cell value
         Returns:
-            CellValue: processed cell value
+            str: processed cell value
         """
         if cell_value is None:
             return ""
@@ -269,14 +247,10 @@ def _process_cell_value(self, cell_value: Any) -> CellValue:
         if isinstance(cell_value, date):
             return cell_value.strftime("%Y-%m-%d")
 
-        # Handle other types
-        if isinstance(cell_value, str|int|float|bool):
-            return cell_value
-
         # For other types, convert to a string
         return str(cell_value)
 
-    def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]:
+    def _extract_table_data(self, sheet: Worksheet) -> TableDataItem:
         """
         Extract table data
         Args:
@@ -295,16 +269,14 @@ def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]:
         # Extract all data
         all_rows = self._extract_all_rows(sheet, max_row, max_col, merged_map)
 
-        return {
-            "dimensions": {
-                "rows": len(all_rows),
-                "columns": max_col
-            },
-            "headers": all_rows[0] if all_rows else [],
-            "data": all_rows[1:] if len(all_rows) > 1 else []
-        }
+        return TableDataItem(
+            rows=len(all_rows),
+            columns=max_col,
+            row_headers=all_rows[0] if all_rows else [],
+            data=all_rows[1:] if len(all_rows) > 1 else []
+        )
 
-    def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], CellValue]:
+    def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], str]:
         """
         Get merged cell information
         Args:
@@ -323,7 +295,7 @@ def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int],
             merged_ranges[(min_row, min_col, max_row, max_col)] = merged_value
         return merged_ranges
 
-    def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], CellValue]:
+    def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict[tuple[int, int], str]:
         """
         Create the merged cell map
         Args:
@@ -342,7 +314,7 @@ def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict
         return merged_map
 
     def _extract_all_rows(self, sheet: Worksheet, max_row: int, max_col: int,
-                          merged_map: dict[tuple[int, int], CellValue]) -> TableData:
+                          merged_map: dict[tuple[int, int], str]) -> list[list[str]]:
         """
         Extract all row data
         Args:
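
A usage sketch for the reworked flow, assuming the module's concrete parser class is named ExcelParser (the class name and input path are not shown in this diff):

import asyncio

from parsers.excel_parser import ExcelParser  # assumed class name; the diff only shows its methods

async def main() -> None:
    parser = ExcelParser()
    if parser.can_parse("demo.xlsx"):  # hypothetical input file
        result = await parser.parse("demo.xlsx")
        if result.success:
            # Table chunks now hold TableDataItem directly; no json.loads step
            for chunk in result.tables:
                print(chunk.content.rows, chunk.content.columns)
        else:
            print("parse failed:", result.error_message)

asyncio.run(main())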

tests/test_excel_parser.py

Lines changed: 23 additions & 21 deletions

@@ -48,20 +48,22 @@ async def test_parse_real_basic_and_image():
         result = await parser.parse(xlsx_path)
 
         assert result.success is True
-        # Content: Sheet1 title, Sheet1 image, Sheet1 table, Sheet2 title, Sheet2 table, closing text
-        content = result.chunks
-        assert len(content) == 6
+        # Content: Sheet1 title, Sheet1 image, Sheet1 table, Sheet2 title, Sheet2 table
+        content = result.tables
+        assert len(content) == 2
+
+        assert len(result.images) == 1
+        assert len(result.texts) == 2
 
         # Verify ordering and key fields
-        assert content[0].type == "text" and content[0].name == "Sheet1"
-        assert content[1].type == "image"
-        assert content[1].name == "#/pictures/0"
-        assert content[1].content.startswith("data:image/")
-
-        assert content[2].type == "table"
-        assert content[3].type == "text" and content[3].name == "Sheet2"
-        assert content[4].type == "table"
-        assert content[5].type == "text" and content[5].name == "结束文本"
+        assert result.texts[0].type == "text" and result.texts[0].name == "Sheet1"
+        assert result.images[0].type == "image"
+        assert result.images[0].name == "#/pictures/0"
+        assert result.images[0].content.startswith("data:image/")
+
+        assert result.tables[0].type == "table"
+        assert result.texts[1].type == "text" and result.texts[1].name == "Sheet2"
+        assert result.tables[1].type == "table"
     finally:
         os.remove(xlsx_path)
     finally:
@@ -91,19 +93,19 @@ async def test_parse_real_merged_cells():
         result = await parser.parse(xlsx_path)
 
         assert result.success is True
-        content = result.chunks
-        # Structure: title, table, closing text
-        assert len(content) == 3
+        # Structure: title, table
+        assert len(result.tables) == 1
+        assert len(result.texts) == 1
 
         # Table at index 1
-        table_chunk: ChunkData = content[1]
+        table_chunk: ChunkData = result.tables[0]
         assert table_chunk.type == "table"
 
-        import json as _json
-        payload = _json.loads(table_chunk.content)
-        assert payload["headers"] == ["Merged Header", "Merged Header"]
-        assert payload["dimensions"]["rows"] == 2
-        assert payload["dimensions"]["columns"] == 2
+        payload = table_chunk.content
+        assert payload.row_headers == ["Merged Header", "Merged Header"]
+        assert payload.data == [["Value1", "Value2"]]
+        assert payload.rows == 2
+        assert payload.columns == 2
     finally:
         os.remove(xlsx_path)
worker.py

Lines changed: 5 additions & 2 deletions

@@ -22,7 +22,7 @@ async def worker(app: Sanic) -> dict[str, Any]:
         parse_result = await parser_factory.parse_document(file_path)
         if not parse_result.success:
             continue
-        chunk_list = parse_result.chunks
+        chunk_list = parse_result.texts + parse_result.tables + parse_result.images + parse_result.formulas
         # Cap concurrency to keep excessive request volume from causing failures
         SEMAPHORE_LIMIT = 10  # adjust as needed
         semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)
@@ -35,5 +35,8 @@ async def enhance_with_semaphore(chunk: ChunkData, semaphore: asyncio.Semaphore)
         enhanced_chunk_list = await asyncio.gather(
             *(enhance_with_semaphore(chunk, semaphore) for chunk in chunk_list)
         )
-        parse_result.chunks = enhanced_chunk_list
+        parse_result.texts = enhanced_chunk_list[:len(parse_result.texts)]
+        parse_result.tables = enhanced_chunk_list[len(parse_result.texts):len(parse_result.texts) + len(parse_result.tables)]
+        parse_result.images = enhanced_chunk_list[len(parse_result.texts) + len(parse_result.tables):len(parse_result.texts) + len(parse_result.tables) + len(parse_result.images)]
+        parse_result.formulas = enhanced_chunk_list[len(parse_result.texts) + len(parse_result.tables) + len(parse_result.images):]
     return parse_result.model_dump(mode="json")
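
The slice arithmetic above stays valid only because each reassigned list has exactly the same length as the one it replaces, so the running offsets computed from len(...) still line up after every assignment. A self-contained sketch of that split-back step, with illustrative stand-in values:

# Flatten grouped lists, transform in order, then split back by length.
texts = ["t1", "t2"]
tables = ["tb1"]
images = ["img1", "img2"]
formulas: list[str] = []

flat = texts + tables + images + formulas
enhanced = [item.upper() for item in flat]  # stand-in for the asyncio.gather step

texts = enhanced[:len(texts)]
tables = enhanced[len(texts):len(texts) + len(tables)]
images = enhanced[len(texts) + len(tables):len(texts) + len(tables) + len(images)]
formulas = enhanced[len(texts) + len(tables) + len(images):]

assert texts == ["T1", "T2"] and tables == ["TB1"]
assert images == ["IMG1", "IMG2"] and formulas == []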
