1919from openpyxl .workbook .workbook import Workbook # type: ignore
2020from openpyxl .worksheet .worksheet import Worksheet # type: ignore
2121
22- from parsers .base_models import ChunkData , ChunkType , DocumentData , DocumentParser
22+ from parsers .base_models import (
23+ ChunkData ,
24+ ChunkType ,
25+ DocumentData ,
26+ DocumentParser ,
27+ TableDataItem ,
28+ )
2329
2430# 忽略 openpyxl 的特定警告
2531warnings .filterwarnings ('ignore' , category = UserWarning , module = 'openpyxl' )
@@ -57,105 +63,77 @@ def __init__(self, config: ExcelParseConfig | None = None):
5763 self .image_index : int = 0
5864 self .supported_formats : list [str ] = ['.xlsx' , '.xls' ]
5965
60- async def parse (self , excel_path : str ) -> DocumentData :
def can_parse(self, file_path: str) -> bool:
    """Report whether ``file_path`` has an extension this parser supports.

    The check is case-insensitive: the path is lower-cased once and then
    compared against every entry of ``self.supported_formats``.

    Args:
        file_path: Path (or bare file name) of the candidate file.

    Returns:
        bool: True when the path ends with one of the supported extensions,
        False otherwise (including when no formats are configured).
    """
    # str.endswith accepts a tuple of suffixes, so a single call replaces
    # the per-format generator and lower() runs only once.
    return file_path.lower().endswith(tuple(self.supported_formats))
6375
async def parse(self, excel_path: str) -> DocumentData:
    """Parse an Excel workbook into a ``DocumentData`` result.

    For every worksheet, in workbook order, this collects:
      * one TEXT chunk holding the sheet heading,
      * whatever ``_extract_sheet_images`` returns for the sheet,
      * one TABLE chunk wrapping the result of ``_extract_table_data``.

    Errors are not propagated. Any exception raised while loading or
    walking the workbook is caught and reported through the returned
    object (``success=False`` plus ``error_message``).

    Args:
        excel_path: Path of the Excel file to parse.

    Returns:
        DocumentData: On success, carries the title (file name without
        extension), the ``texts`` / ``tables`` / ``images`` chunk lists and
        the elapsed processing time. On failure, carries the error message
        and the elapsed processing time.
    """
    start_time = time.time()

    # Restart image numbering for each document so a reused parser instance
    # does not carry the counter over from a previously parsed file.
    # NOTE(review): assumes _extract_sheet_images advances self.image_index
    # -- confirm against that helper.
    self.image_index = 0

    try:
        texts: list[ChunkData] = []
        tables: list[ChunkData] = []
        images: list[ChunkData] = []

        workbook = self._load_workbook(excel_path)

        for sheet_index, sheet_name in enumerate(workbook.sheetnames):
            sheet = workbook[sheet_name]

            # Sheet heading, numbered from 1 for readability.
            texts.append(ChunkData(
                type=ChunkType.TEXT,
                name=sheet_name,
                content=f"工作表 {sheet_index + 1} : {sheet_name} ",
                description="工作表标题",
            ))

            # Images embedded in this sheet.
            images.extend(self._extract_sheet_images(sheet))

            # Cell data of this sheet as a single table chunk.
            tables.append(ChunkData(
                type=ChunkType.TABLE,
                name="表格",
                content=self._extract_table_data(sheet),
                description="表格",
            ))

        return DocumentData(
            # The title is the file name without its extension.
            title=Path(excel_path).stem,
            texts=texts,
            tables=tables,
            images=images,
            processing_time=time.time() - start_time,
            success=True,
        )
    except Exception as e:
        # Deliberate catch-all at the parser boundary: callers receive a
        # failed DocumentData instead of an exception.
        return DocumentData(
            success=False,
            error_message=str(e),
            processing_time=time.time() - start_time,
        )
95136
96- def can_parse (self , file_path : str ) -> bool :
97- """
98- 验证输入文件
99- Args:
100- file_path: 文件路径
101- Returns:
102- bool: 是否支持解析
103- """
104- return any (file_path .lower ().endswith (fmt ) for fmt in self .supported_formats )
105-
106- def _excel_to_json (self , excel_path : str ) -> tuple [str , list [ChunkData ]]:
107- """
108- 将Excel文件转换为JSON格式
109- Args:
110- excel_path: Excel文件路径
111- Returns:
112- DocumentData: 文档数据
113- """
114- # 获取文件名作为标题(不带扩展名)
115- title = Path (excel_path ).stem
116-
117- # 初始化内容列表和图片列表
118- content : list [ChunkData ] = []
119- self .image_index = 0
120-
121- # 加载工作簿
122- workbook = self ._load_workbook (excel_path )
123-
124- # 处理每个工作表
125- for sheet_index , sheet_name in enumerate (workbook .sheetnames ):
126- sheet = workbook [sheet_name ]
127-
128- # 添加工作表标题
129- content .append (ChunkData (
130- type = ChunkType .TEXT ,
131- name = sheet_name ,
132- content = f"工作表 { sheet_index + 1 } : { sheet_name } " ,
133- description = "工作表标题"
134- ))
135-
136- # 处理图片
137- sheet_images = self ._extract_sheet_images (sheet )
138- content .extend (sheet_images )
139-
140- # 处理表格数据
141- table_content = self ._extract_table_data (sheet )
142- content .append (ChunkData (
143- type = ChunkType .TABLE ,
144- name = "表格" ,
145- content = json .dumps (table_content ),
146- description = "表格"
147- ))
148-
149- # 添加结束文本
150- content .append (ChunkData (
151- type = ChunkType .TEXT ,
152- name = "结束文本" ,
153- content = "" ,
154- description = "结束文本"
155- ))
156-
157- return title , content
158-
159137 def _load_workbook (self , excel_path : str ) -> Workbook :
160138 """
161139 加载Excel工作簿
@@ -250,13 +228,13 @@ def _get_image_format(self, img_obj: Image) -> str:
250228 return img_format
251229 return self .config .default_image_format
252230
253- def _process_cell_value (self , cell_value : Any ) -> CellValue :
231+ def _process_cell_value (self , cell_value : Any ) -> str :
254232 """
255233 预处理单元格值,将datetime对象转换为字符串
256234 Args:
257235 cell_value: 原始单元格值
258236 Returns:
259- CellValue : 处理后的单元格值
237+ str : 处理后的单元格值
260238 """
261239 if cell_value is None :
262240 return ""
@@ -269,14 +247,10 @@ def _process_cell_value(self, cell_value: Any) -> CellValue:
269247 if isinstance (cell_value , date ):
270248 return cell_value .strftime ("%Y-%m-%d" )
271249
272- # 处理其他类型
273- if isinstance (cell_value , str | int | float | bool ):
274- return cell_value
275-
276250 # 对于其他类型,转换为字符串
277251 return str (cell_value )
278252
279- def _extract_table_data (self , sheet : Worksheet ) -> dict [ str , Any ] :
253+ def _extract_table_data (self , sheet : Worksheet ) -> TableDataItem :
280254 """
281255 提取表格数据
282256 Args:
@@ -295,16 +269,14 @@ def _extract_table_data(self, sheet: Worksheet) -> dict[str, Any]:
295269 # 提取所有数据
296270 all_rows = self ._extract_all_rows (sheet , max_row , max_col , merged_map )
297271
298- return {
299- "dimensions" : {
300- "rows" : len (all_rows ),
301- "columns" : max_col
302- },
303- "headers" : all_rows [0 ] if all_rows else [],
304- "data" : all_rows [1 :] if len (all_rows ) > 1 else []
305- }
272+ return TableDataItem (
273+ rows = len (all_rows ),
274+ columns = max_col ,
275+ row_headers = all_rows [0 ] if all_rows else [],
276+ data = all_rows [1 :] if len (all_rows ) > 1 else []
277+ )
306278
307- def _get_merged_cells (self , sheet : Worksheet ) -> dict [tuple [int , int , int , int ], CellValue ]:
279+ def _get_merged_cells (self , sheet : Worksheet ) -> dict [tuple [int , int , int , int ], str ]:
308280 """
309281 获取合并单元格信息
310282 Args:
@@ -323,7 +295,7 @@ def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int],
323295 merged_ranges [(min_row , min_col , max_row , max_col )] = merged_value
324296 return merged_ranges
325297
326- def _create_merged_cell_map (self , merged_ranges : dict , sheet : Worksheet ) -> dict [tuple [int , int ], CellValue ]:
298+ def _create_merged_cell_map (self , merged_ranges : dict , sheet : Worksheet ) -> dict [tuple [int , int ], str ]:
327299 """
328300 创建合并单元格映射
329301 Args:
@@ -342,7 +314,7 @@ def _create_merged_cell_map(self, merged_ranges: dict, sheet: Worksheet) -> dict
342314 return merged_map
343315
344316 def _extract_all_rows (self , sheet : Worksheet , max_row : int , max_col : int ,
345- merged_map : dict [tuple [int , int ], CellValue ]) -> TableData :
317+ merged_map : dict [tuple [int , int ], str ]) -> list [ list [ str ]] :
346318 """
347319 提取所有行数据
348320 Args:
0 commit comments