33from enum import Enum
44from typing import Any
55
6- from pydantic import BaseModel
6+ from pydantic import BaseModel , Field
77
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
99
class TableDataItem(BaseModel):
    """Table data model.

    ``rows`` and ``columns`` are required; the header and data lists default
    to empty lists.
    """

    rows: int  # number of rows
    columns: int  # number of columns
    # default_factory builds a fresh list for every instance.
    row_headers: list[Any] = Field(default_factory=list)  # row headers
    column_headers: list[Any] = Field(default_factory=list)  # column headers
    data: list[list[str]] = Field(default_factory=list)  # data, as nested lists of strings
2525
2626class ChunkData (BaseModel ):
2727 """块数据类"""
@@ -33,10 +33,10 @@ class ChunkData(BaseModel):
class DocumentData(BaseModel):
    """Result of parsing a document.

    ``success`` is the only required field; every other field has a default.
    """

    title: str = ""
    # One chunk list per content kind named by the field.  default_factory
    # builds a fresh list for every instance.
    texts: list[ChunkData] = Field(default_factory=list)
    tables: list[ChunkData] = Field(default_factory=list)
    images: list[ChunkData] = Field(default_factory=list)
    formulas: list[ChunkData] = Field(default_factory=list)
    # NOTE(review): unit is not stated in this file (presumably seconds) — confirm.
    processing_time: float = 0
    success: bool  # required: no default
    # NOTE(review): presumably populated only when success is False — confirm.
    error_message: str | None = None
@@ -45,7 +45,7 @@ class DocumentParser(ABC):
4545 """文档解析器基类"""
4646
def __init__(self) -> None:
    """Initialise the parser with no supported formats.

    ``supported_formats`` starts empty; ``can_parse`` iterates it and matches
    each entry as a suffix of the lower-cased file path.
    """
    # A plain list is correct here.  pydantic's Field(default_factory=...) is
    # only meaningful as a class-level default on a BaseModel; assigned to an
    # instance attribute of this ABC it would store the Field() return value
    # itself instead of a list.  A fresh [] per __init__ call is never shared
    # between instances, so there is no mutable-default problem to avoid.
    self.supported_formats: list[str] = []
4949
5050 @abstractmethod
5151 async def parse (self , file_path : str ) -> DocumentData :
@@ -55,4 +55,4 @@ async def parse(self, file_path: str) -> DocumentData:
@abstractmethod
def can_parse(self, file_path: str) -> bool:
    """Check whether this parser can handle the given file.

    The default implementation does a case-insensitive suffix match of
    ``file_path`` against ``self.supported_formats``.  The method is still
    abstract, so concrete subclasses must override it; they can delegate
    here with ``super().can_parse(file_path)``.

    Args:
        file_path: Path or file name to test.

    Returns:
        True if ``file_path`` ends with one of the supported formats,
        False otherwise (including when no formats are registered).
    """
    # Lower-case both sides: the path is normalised below, so an entry
    # registered as ".PDF" could otherwise never match.
    suffixes = tuple(fmt.lower() for fmt in self.supported_formats)
    # str.endswith accepts a tuple of candidates; an empty tuple yields False.
    return file_path.lower().endswith(suffixes)
0 commit comments