|
5 | 5 | 包括表格数据提取和图片处理。
|
6 | 6 | """
|
7 | 7 |
|
| 8 | +import asyncio |
8 | 9 | import base64
|
9 | 10 | import json
|
| 11 | +import logging |
10 | 12 | import time
|
11 | 13 | import warnings
|
12 | 14 | from dataclasses import dataclass
|
|
37 | 39 | CellValue = str|int|float|bool|None|datetime|date
|
38 | 40 | TableData = list[list[CellValue]]
|
39 | 41 |
|
| 42 | +logger = logging.getLogger(__name__) |
40 | 43 |
|
41 | 44 | @dataclass
|
42 | 45 | class ExcelParseConfig:
|
@@ -79,53 +82,88 @@ async def parse(self, file_path: Path) -> DocumentData:
|
79 | 82 |
|
80 | 83 | try:
|
81 | 84 | # 初始化内容列表和图片列表
|
82 |
| - texts: list[ChunkData] = [] |
83 |
| - tables: list[ChunkData] = [] |
84 |
| - images: list[ChunkData] = [] |
85 | 85 |
|
86 | 86 | # 加载工作簿
|
87 | 87 | workbook = self._load_workbook(file_path)
|
88 | 88 |
|
89 |
| - # 处理每个工作表 |
90 |
| - for sheet_index, sheet_name in enumerate(workbook.sheetnames): |
91 |
| - sheet = workbook[sheet_name] |
92 |
| - |
93 |
| - # 添加工作表标题 |
94 |
| - texts.append(ChunkData( |
95 |
| - type=ChunkType.TEXT, |
96 |
| - name=sheet_name, |
97 |
| - content=TextDataItem( |
98 |
| - text=f"工作表 {sheet_index + 1}: {sheet_name}", |
99 |
| - ), |
100 |
| - )) |
101 |
| - |
102 |
| - # 处理图片 |
103 |
| - sheet_images = self._extract_sheet_images(sheet) |
104 |
| - images.extend(sheet_images) |
105 |
| - |
106 |
| - # 处理表格数据 |
107 |
| - table_content = self._extract_table_data(sheet) |
108 |
| - tables.append(ChunkData( |
109 |
| - type=ChunkType.TABLE, |
110 |
| - name=f"#/tables/{sheet_index}", |
111 |
| - content=table_content |
112 |
| - )) |
| 89 | + # 并行处理每个工作表 |
| 90 | + document_data = await self._process_sheets_parallel(workbook, file_path) |
| 91 | + |
113 | 92 | processing_time = time.time() - start_time
|
114 |
| - return DocumentData( |
115 |
| - title=Path(file_path).stem, |
116 |
| - texts=texts, |
117 |
| - tables=tables, |
118 |
| - images=images, |
119 |
| - processing_time=processing_time, |
120 |
| - success=True |
121 |
| - ) |
| 93 | + document_data.processing_time = processing_time |
| 94 | + return document_data |
122 | 95 | except Exception as e:
|
123 |
| - processing_time = time.time() - start_time |
124 |
| - return DocumentData( |
125 |
| - success=False, |
126 |
| - error_message=str(e), |
127 |
| - processing_time=processing_time |
128 |
| - ) |
| 96 | + raise Exception(f"Failed to parse Excel file {file_path}: {type(e).__name__}: {e}") from e |
| 97 | + |
async def _process_sheets_parallel(self, workbook: Workbook, file_path: Path) -> DocumentData:
    """Process every worksheet of *workbook* concurrently and merge the results.

    One ``_process_sheet_async`` coroutine is created per entry in
    ``workbook.sheetnames`` and all of them are awaited together. Because
    ``asyncio.gather`` returns results in the order the awaitables were
    passed, the merged chunks follow the sheet order of the workbook.

    Args:
        workbook: The loaded workbook whose sheets are processed.
        file_path: Path of the source file; its stem becomes the document title.

    Returns:
        A ``DocumentData`` with ``success=True`` holding the text, table and
        image chunks of all sheets. Sheets whose processing returned a falsy
        result (``_process_sheet_async`` returns ``None`` on failure) are
        skipped, so they contribute no chunks.
    """
    tasks = [
        self._process_sheet_async(workbook[sheet_name], sheet_index, sheet_name)
        for sheet_index, sheet_name in enumerate(workbook.sheetnames)
    ]

    # Always bind ``results``: previously it was only assigned when there was
    # at least one task, so a workbook without sheets raised UnboundLocalError
    # in the merge loop below.
    results = await asyncio.gather(*tasks) if tasks else []

    # Merge the per-sheet results into flat lists.
    texts: list[ChunkData] = []
    tables: list[ChunkData] = []
    images: list[ChunkData] = []

    for result in results:
        if not result:
            # Failed sheet: nothing to merge.
            continue
        texts.extend(result.get('texts', []))
        tables.extend(result.get('tables', []))
        images.extend(result.get('images', []))

    return DocumentData(
        title=Path(file_path).stem,
        texts=texts,
        tables=tables,
        images=images,
        success=True,
    )
| 129 | + |
async def _process_sheet_async(self, sheet: Worksheet, sheet_index: int, sheet_name: str) -> dict|None:
    """Extract the text, table and image chunks of a single worksheet.

    Image extraction and table extraction are both submitted to the event
    loop's default executor and awaited together.

    Args:
        sheet: The worksheet to process.
        sheet_index: Zero-based position of the sheet; used in the heading
            text (as ``sheet_index + 1``) and in the table chunk name.
        sheet_name: Name of the sheet; used as the text chunk name and in
            the heading text.

    Returns:
        A dict with the keys ``'texts'``, ``'tables'`` and ``'images'``, or
        ``None`` when any exception was raised while processing the sheet
        (the error is logged instead of propagated). ``'tables'`` is an
        empty list when the extracted table content is falsy.
    """
    try:
        # Inside a coroutine a loop is always running; get_running_loop()
        # is the documented way to obtain it (get_event_loop() is the
        # legacy call and may emit deprecation warnings in other contexts).
        loop = asyncio.get_running_loop()

        # Run both extractions in the default executor (executor=None).
        image_task = loop.run_in_executor(None, self._extract_sheet_images, sheet)
        table_task = loop.run_in_executor(None, self._extract_table_data, sheet)

        # Wait for both extractions to finish.
        sheet_images, table_content = await asyncio.gather(image_task, table_task)

        # Heading chunk for the worksheet.
        texts = [ChunkData(
            type=ChunkType.TEXT,
            name=sheet_name,
            content=TextDataItem(
                text=f"工作表 {sheet_index + 1}: {sheet_name}",
            ),
        )]

        # Table chunk; omitted when no table content was extracted.
        tables = [ChunkData(
            type=ChunkType.TABLE,
            name=f"#/tables/{sheet_index}",
            content=table_content
        )] if table_content else []

        return {
            'texts': texts,
            'tables': tables,
            'images': sheet_images
        }

    except Exception as e:
        # Best-effort: a failing sheet must not abort the whole document.
        # logger.exception keeps the same message at ERROR level and adds
        # the traceback, which the plain logger.error call dropped.
        logger.exception("Error processing sheet %s: %s", sheet_name, e)
        return None
129 | 167 |
|
130 | 168 | def _load_workbook(self, excel_path: Path) -> Workbook:
|
131 | 169 | """
|
|
0 commit comments