USTC-KnowledgeComputingLab
diff --git a/.env.template b/.env.template
Lines changed: 4 additions & 1 deletion
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/mineru.json b/mineru.json
Lines changed: 37 additions & 0 deletions
diff --git a/parsers/base_models.py b/parsers/base_models.py
Lines changed: 22 additions & 3 deletions
diff --git a/parsers/docx_parser.py b/parsers/docx_parser.py
Lines changed: 29 additions & 44 deletions
diff --git a/parsers/excel_parser.py b/parsers/excel_parser.py
Lines changed: 13 additions & 11 deletions
@@ -48,4 +48,7 @@ TASK_TIMEOUT=3600
 # 调试模式
 DEBUG=false
 # 日志级别
-LOG_LEVEL=INFO
+LOG_LEVEL=INFO
+
+# PDF解析
+MINERU_MODEL_SOURCE=local
@@ -174,4 +174,5 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 
-examples/
+examples/
+models/
@@ -0,0 +1,37 @@
+{
+    "bucket_info": {
+        "bucket-name-1": [
+            "ak",
+            "sk",
+            "endpoint"
+        ],
+        "bucket-name-2": [
+            "ak",
+            "sk",
+            "endpoint"
+        ]
+    },
+    "latex-delimiter-config": {
+        "display": {
+            "left": "$$",
+            "right": "$$"
+        },
+        "inline": {
+            "left": "$",
+            "right": "$"
+        }
+    },
+    "llm-aided-config": {
+        "title_aided": {
+            "api_key": "your_api_key",
+            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+            "model": "qwen2.5-32b-instruct",
+            "enable": false
+        }
+    },
+    "models-dir": {
+        "pipeline": "models",
+        "vlm": ""
+    },
+    "config_version": "1.3.0"
+}
@@ -2,7 +2,7 @@
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Any
-
+from pathlib import Path
 from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
@@ -19,15 +19,34 @@ class TableDataItem(BaseModel):
     """表格数据类"""
     rows: int  # 行数
     columns: int  # 列数
+    grid: list[list[str]] = Field(default_factory=list)  # 网格数据
     row_headers: list[Any] = Field(default_factory=list)  # 行头
     column_headers: list[Any] = Field(default_factory=list)  # 列头
     data: list[list[str]] = Field(default_factory=list)  # 数据
+    caption: list[str] = Field(default_factory=list)  # 表格标题
+    footnote: list[str] = Field(default_factory=list)  # 表格注脚
+
+class TextDataItem(BaseModel):
+    """Text data item."""
+    text: str  # text content
+    text_level: int|None = None  # text level (optional; defaults to None)
+
+class ImageDataItem(BaseModel):
+    """Image data item."""
+    uri: str|None = None  # image URI (optional; defaults to None)
+    caption: list[str] = Field(default_factory=list)  # image captions
+    footnote: list[str] = Field(default_factory=list)  # image footnotes
+
+class FormulaDataItem(BaseModel):
+    """Formula data item."""
+    text: str  # formula text
+    text_format: str|None = None  # formula format (optional; defaults to None)
 
 class ChunkData(BaseModel):
     """块数据类"""
     type: ChunkType
     name: str|None = None
-    content: str|TableDataItem|None = None
+    content: TableDataItem|TextDataItem|ImageDataItem|FormulaDataItem  # required typed payload: a table, text, image, or formula item
     description: str|None = None
 
 class DocumentData(BaseModel):
@@ -45,6 +64,6 @@ class DocumentParser(ABC):
     """文档解析器基类"""
 
     @abstractmethod
-    async def parse(self, file_path: str) -> DocumentData:
+    async def parse(self, file_path: Path) -> DocumentData:
         """解析文档"""
         pass
@@ -8,7 +8,7 @@
 import asyncio
 import logging
 import time
-
+from pathlib import Path
 from docling.datamodel.base_models import InputFormat
 from docling.document_converter import DocumentConverter, WordFormatOption
 from docling.pipeline.simple_pipeline import SimplePipeline
@@ -31,6 +31,9 @@
     DocumentData,
     DocumentParser,
     TableDataItem,
+    ImageDataItem,
+    TextDataItem,
+    FormulaDataItem
 )
 from parsers.parser_registry import register_parser
 
@@ -54,7 +57,7 @@ def __init__(self) -> None:
         )
         logger.debug("DocxDocumentParser initialized with SimplePipeline")
 
-    async def parse(self, file_path: str) -> DocumentData:
+    async def parse(self, file_path: Path) -> DocumentData:
         """异步解析DOCX文件
 
         Args:
@@ -70,16 +73,6 @@ async def parse(self, file_path: str) -> DocumentData:
             result = await loop.run_in_executor(None, self._converter.convert, file_path)
             doc_data = result.document
 
-            # 确保文档数据包含所有必要的属性
-            if not hasattr(doc_data, 'name'):
-                doc_data.name = 'Unknown Document'
-            if not hasattr(doc_data, 'texts'):
-                doc_data.texts = []
-            if not hasattr(doc_data, 'pictures'):
-                doc_data.pictures = []
-            if not hasattr(doc_data, 'tables'):
-                doc_data.tables = []
-
             title = self._extract_title(doc_data)
             images = self._extract_images(doc_data.pictures)
             tables = self._extract_tables(doc_data.tables)
@@ -117,20 +110,18 @@ def _extract_images(self, pictures: list[PictureItem]) -> list[ChunkData]:
         """
         image_items = []
         for idx, picture in enumerate(pictures):
-            image_uri = ""
-            if hasattr(picture, 'image') and picture.image and hasattr(picture.image, 'uri'):
-                image_uri = str(picture.image.uri)
-
-            caption = ""
-            if hasattr(picture, 'captions') and picture.captions:
-                caption = str(picture.captions[0])
-
+            image_uri = str(picture.image.uri)
+            caption = [caption.cref for caption in picture.captions]
+            footnote = [footnote.cref for footnote in picture.footnotes]
             image_items.append(
                 ChunkData(
                     type=ChunkType.IMAGE,
-                    name=getattr(picture, 'self_ref', None) or f"#/pictures/{idx}",
-                    content=image_uri,
-                    description=caption
+                    name=f"#/pictures/{idx}",
+                    content=ImageDataItem(
+                        uri=image_uri,
+                        caption=caption,
+                        footnote=footnote
+                    )
                 )
             )
 
@@ -145,32 +136,22 @@ def _extract_tables(self, tables: list[TableItem]) -> list[ChunkData]:
         Returns:
             List[ChunkData]: 表格列表
         """
-        # 添加安全检查，确保 tables 参数存在且可迭代
-        if not tables or not hasattr(tables, '__iter__'):
-            return []
-
         table_items: list[ChunkData] = []
         for table in tables:
-            if not hasattr(table, 'data') or not hasattr(table.data, 'grid'):
-                continue
-            if len(table.data.grid) == 0:
-                continue
-
-            table_cells = table.data.grid
-            row_headers = [cell.text for cell in table_cells[0] if cell.row_header]
-            column_headers = [cell.text for cell in table_cells[0] if cell.column_header]
-            data = [[cell.text for cell in row] for row in table_cells[1:]]
+            caption = [caption.cref for caption in table.captions]
+            footnote = [footnote.cref for footnote in table.footnotes]
+            grid = [[cell.text if cell.text else '' for cell in row] for row in table.data.grid]
             table_data = TableDataItem(
                 rows=table.data.num_rows,
                 columns=table.data.num_cols,
-                row_headers=row_headers,
-                column_headers=column_headers,
-                data=data
+                grid=grid,
+                caption=caption,
+                footnote=footnote
             )
             table_items.append(
                 ChunkData(
                     type=ChunkType.TABLE,
-                    name=getattr(table, 'self_ref', None) or f"table-{len(table_items)}",
+                    name=f"#/tables/{len(table_items)}",
                     content=table_data
                 )
             )
@@ -212,16 +193,20 @@ def _extract_texts(self, texts:list[TitleItem|SectionHeaderItem|ListItem|CodeIte
                     text_items.append(
                         ChunkData(
                             type=ChunkType.FORMULA,
-                            name=item.self_ref or f"formula-{len(text_items)}",
-                            content=item.text
+                            name=f"formula-{len(text_items)}",
+                            content=FormulaDataItem(
+                                text=item.text
+                            )
                         )
                     )
                 case _:
                     text_items.append(
                         ChunkData(
                             type=ChunkType.TEXT,
-                            name=f"text-{len(text_items)}",
-                            content=item.text
+                            name=f"#/texts/{len(text_items)}",
+                            content=TextDataItem(
+                                text=item.text
+                            )
                         )
                     )
         return text_items
@@ -25,6 +25,8 @@
     DocumentData,
     DocumentParser,
     TableDataItem,
+    TextDataItem,
+    ImageDataItem
 )
 from parsers.parser_registry import register_parser
 
@@ -64,7 +66,7 @@ def __init__(self, config: ExcelParseConfig | None = None):
         self.config: ExcelParseConfig = config or ExcelParseConfig()
         self.image_index: int = 0
 
-    async def parse(self, file_path: str) -> DocumentData:
+    async def parse(self, file_path: Path) -> DocumentData:
         """
         将Excel文件转换为JSON格式
         Args:
@@ -92,8 +94,9 @@ async def parse(self, file_path: str) -> DocumentData:
                 texts.append(ChunkData(
                     type=ChunkType.TEXT,
                     name=sheet_name,
-                    content=f"工作表 {sheet_index + 1}: {sheet_name}",
-                    description="工作表标题"
+                    content=TextDataItem(
+                        text=f"工作表 {sheet_index + 1}: {sheet_name}",
+                    ),
                 ))
 
                 # 处理图片
@@ -104,9 +107,8 @@ async def parse(self, file_path: str) -> DocumentData:
                 table_content = self._extract_table_data(sheet)
                 tables.append(ChunkData(
                     type=ChunkType.TABLE,
-                    name="表格",
-                    content=table_content,
-                    description="表格"
+                    name=f"#/tables/{sheet_index}",
+                    content=table_content
                 ))
             processing_time = time.time() - start_time
             return DocumentData(
@@ -125,7 +127,7 @@ async def parse(self, file_path: str) -> DocumentData:
                 processing_time=processing_time
             )
 
-    def _load_workbook(self, excel_path: str) -> Workbook:
+    def _load_workbook(self, excel_path: Path) -> Workbook:
         """
         加载Excel工作簿
         Args:
@@ -191,8 +193,9 @@ def _process_image_object(self, img_obj: Image) -> ChunkData | None:
             image_info = ChunkData(
                 type=ChunkType.IMAGE,
                 name=f"#/pictures/{self.image_index}",
-                content=uri,
-                description=self.config.image_description_placeholder
+                content=ImageDataItem(
+                    uri=uri
+                )
             )
 
             self.image_index += 1
@@ -263,8 +266,7 @@ def _extract_table_data(self, sheet: Worksheet) -> TableDataItem:
         return TableDataItem(
             rows=len(all_rows),
             columns=max_col,
-            row_headers=all_rows[0] if all_rows else [],
-            data=all_rows[1:] if len(all_rows) > 1 else []
+            grid=all_rows
         )
 
     def _get_merged_cells(self, sheet: Worksheet) -> dict[tuple[int, int, int, int], str]: