|
| 1 | +""" |
| 2 | +DOCX文档解析器模块 |
| 3 | +
|
| 4 | +该模块提供使用Docling库解析DOCX文档并提取结构化内容的功能。 |
| 5 | +支持标题、段落、列表、表格和图片的识别与输出。 |
| 6 | +""" |
| 7 | + |
| 8 | +import asyncio |
| 9 | +import logging |
| 10 | +import time |
| 11 | + |
| 12 | +from docling.datamodel.base_models import InputFormat |
| 13 | +from docling.document_converter import DocumentConverter, WordFormatOption |
| 14 | +from docling.pipeline.simple_pipeline import SimplePipeline |
| 15 | +from docling_core.types.doc.document import ( |
| 16 | + CodeItem, |
| 17 | + DocItemLabel, |
| 18 | + DoclingDocument, |
| 19 | + FormulaItem, |
| 20 | + ListItem, |
| 21 | + PictureItem, |
| 22 | + SectionHeaderItem, |
| 23 | + TableItem, |
| 24 | + TextItem, |
| 25 | + TitleItem, |
| 26 | +) |
| 27 | + |
| 28 | +from parsers.base_models import ( |
| 29 | + ChunkData, |
| 30 | + ChunkType, |
| 31 | + DocumentData, |
| 32 | + DocumentParser, |
| 33 | + TableDataItem, |
| 34 | +) |
| 35 | + |
| 36 | +logger = logging.getLogger(__name__) |
| 37 | + |
| 38 | + |
class DocxDocumentParser(DocumentParser):
    """DOCX document parser.

    Extracts structured content (title, paragraphs, lists, tables, images,
    formulas) from DOCX files using Docling's SimplePipeline, exposing the
    async ``parse`` interface required by the ``DocumentParser`` base class.
    """

    def __init__(self) -> None:
        """Initialize a DOCX-only Docling converter."""
        super().__init__()
        self.supported_formats = [".docx"]
        # SimplePipeline suffices for native DOCX: no OCR or layout model needed.
        self._converter = DocumentConverter(
            format_options={
                InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline)
            },
            allowed_formats=[InputFormat.DOCX],
        )
        logger.debug("DocxDocumentParser initialized with SimplePipeline")

    def can_parse(self, file_path: str) -> bool:
        """Check whether this parser supports the given file.

        Args:
            file_path: Path of the candidate file.

        Returns:
            bool: True when the file extension is supported.
        """
        # str.endswith accepts a tuple, so one call covers every format.
        return file_path.lower().endswith(tuple(self.supported_formats))

    async def parse(self, file_path: str) -> DocumentData:
        """Asynchronously parse a DOCX file.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            DocumentData: Parse result carrying the title, text/table/image
            chunks and the processing time; on failure ``success`` is False
            and ``error_message`` holds the cause.
        """
        # perf_counter is monotonic, so wall-clock jumps cannot skew the
        # reported processing time.
        start_time = time.perf_counter()
        try:
            # Docling's convert() is blocking; run it in the default executor so
            # the event loop stays responsive. get_running_loop() is the
            # non-deprecated replacement for get_event_loop() in a coroutine.
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(None, self._converter.convert, file_path)
            doc_data = result.document

            # Read the document's collections defensively instead of mutating
            # the (possibly pydantic-validated) result object in place.
            title = self._extract_title(doc_data)
            images = self._extract_images(getattr(doc_data, 'pictures', None) or [])
            tables = self._extract_tables(getattr(doc_data, 'tables', None) or [])
            texts = self._extract_texts(getattr(doc_data, 'texts', None) or [])

            processing_time = time.perf_counter() - start_time
            # Lazy %-args: no formatting work unless INFO is enabled.
            logger.info("Successfully parsed DOCX: %s (took %.2fs)", file_path, processing_time)
            return DocumentData(
                title=title,
                texts=texts,
                tables=tables,
                images=images,
                processing_time=processing_time,
                success=True,
            )

        except Exception as e:
            processing_time = time.perf_counter() - start_time
            # logger.exception records the full traceback alongside the message.
            logger.exception(
                "Failed to parse DOCX file %s: %s: %s", file_path, type(e).__name__, e
            )
            return DocumentData(
                success=False,
                error_message=str(e),
                processing_time=processing_time,
            )

    def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
        """Convert Docling picture items into IMAGE chunks.

        Args:
            pictures: Picture items from the Docling document.

        Returns:
            list[ChunkData]: One IMAGE chunk per picture; ``content`` holds the
            image URI (empty when unavailable) and ``description`` the first
            caption, if any.
        """
        image_items: list[ChunkData] = []
        for idx, picture in enumerate(pictures):
            image = getattr(picture, 'image', None)
            image_uri = str(image.uri) if image and hasattr(image, 'uri') else ""

            # NOTE(review): captions look like Docling reference items; str()
            # of the first one is kept as the description — confirm rendering.
            captions = getattr(picture, 'captions', None)
            caption = str(captions[0]) if captions else ""

            image_items.append(
                ChunkData(
                    type=ChunkType.IMAGE,
                    name=getattr(picture, 'self_ref', None) or f"#/pictures/{idx}",
                    content=image_uri,
                    description=caption,
                )
            )

        return image_items

    def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
        """Convert Docling table items into TABLE chunks.

        Args:
            tables: Table items from the Docling document.

        Returns:
            list[ChunkData]: One TABLE chunk per non-empty table. Headers are
            collected from the first grid row (cells flagged ``row_header`` /
            ``column_header``); data comes from the remaining rows.
        """
        # Safety check: tolerate a missing or non-iterable collection.
        if not tables or not hasattr(tables, '__iter__'):
            return []

        table_items: list[ChunkData] = []
        for table in tables:
            grid = getattr(getattr(table, 'data', None), 'grid', None)
            if not grid:
                continue

            header_row = grid[0]
            row_headers = [cell.text for cell in header_row if cell.row_header]
            column_headers = [cell.text for cell in header_row if cell.column_header]
            body = [[cell.text for cell in row] for row in grid[1:]]

            table_items.append(
                ChunkData(
                    type=ChunkType.TABLE,
                    name=getattr(table, 'self_ref', None) or f"table-{len(table_items)}",
                    content=TableDataItem(
                        rows=table.data.num_rows,
                        columns=table.data.num_cols,
                        row_headers=row_headers,
                        column_headers=column_headers,
                        data=body,
                    ),
                )
            )

        return table_items

    def _extract_title(self, doc_data: DoclingDocument) -> str:
        """Extract the document title.

        Args:
            doc_data: The parsed Docling document.

        Returns:
            str: The text of the first item labeled TITLE, falling back to the
            document's name when no title item exists.
        """
        for item in getattr(doc_data, 'texts', None) or []:
            if getattr(item, 'label', None) == DocItemLabel.TITLE and item.text:
                return item.text
        return getattr(doc_data, 'name', None) or 'Unknown Document'

    def _extract_texts(
        self,
        texts: list[TitleItem | SectionHeaderItem | ListItem | CodeItem | FormulaItem | TextItem],
    ) -> list[ChunkData]:
        """Convert Docling text items into TEXT/FORMULA chunks.

        Args:
            texts: Text-like items from the Docling document.

        Returns:
            list[ChunkData]: FORMULA chunks for formula-labeled items, TEXT
            chunks for everything else; unlabeled or empty items are skipped.
        """
        text_items: list[ChunkData] = []

        for item in texts:
            if not hasattr(item, 'label'):
                continue
            if not hasattr(item, 'text') or not item.text:
                continue

            if item.label == DocItemLabel.FORMULA:
                # getattr keeps the self_ref fallback consistent with the
                # image/table extractors above.
                chunk_type = ChunkType.FORMULA
                name = getattr(item, 'self_ref', None) or f"formula-{len(text_items)}"
            else:
                chunk_type = ChunkType.TEXT
                name = f"text-{len(text_items)}"

            text_items.append(ChunkData(type=chunk_type, name=name, content=item.text))

        return text_items
0 commit comments