From 0de0b6895f93009df4312360e734caa65652f097 Mon Sep 17 00:00:00 2001 From: Alex Yang Date: Thu, 7 Nov 2024 13:30:44 -0800 Subject: [PATCH] feat: improve type --- .../llama_index/core/readers/file/base.py | 20 ++++++++++++- .../llama_index/readers/box/BoxReader/base.py | 29 +++++++++---------- .../readers/microsoft_onedrive/base.py | 7 +++-- .../readers/minio/minio_client/base.py | 16 +++++----- 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py index 3a9ff22f9fe95..e35ff220e0ec9 100644 --- a/llama-index-core/llama_index/core/readers/file/base.py +++ b/llama-index-core/llama_index/core/readers/file/base.py @@ -222,7 +222,25 @@ class DirectoryReaderArgs(BaseDirectoryReaderArgs): num_files_limit: NotRequired[Optional[int]] -class SimpleDirectoryReader(BaseReader, ResourcesReaderMixin, FileSystemReaderMixin): +class DirectoryReaderData: + """ + Base data for directory readers. + """ + + exclude: Optional[List] = None + exclude_hidden: Optional[bool] = None + encoding: Optional[str] = None + errors: Optional[str] = None + recursive: Optional[bool] = None + filename_as_id: Optional[bool] = None + required_exts: Optional[List[str]] = None + raise_on_error: Optional[bool] = None + num_files_limit: Optional[int] = None + + +class SimpleDirectoryReader( + BaseReader, ResourcesReaderMixin, FileSystemReaderMixin, DirectoryReaderData +): """ Simple directory reader. diff --git a/llama-index-integrations/readers/llama-index-readers-box/llama_index/readers/box/BoxReader/base.py b/llama-index-integrations/readers/llama-index-readers-box/llama_index/readers/box/BoxReader/base.py index c8a1a261f3d88..39216677e2d67 100644 --- a/llama-index-integrations/readers/llama-index-readers-box/llama_index/readers/box/BoxReader/base.py +++ b/llama-index-integrations/readers/llama-index-readers-box/llama_index/readers/box/BoxReader/base.py @@ -1,16 +1,21 @@ import logging import tempfile -from typing import List, Optional, Dict, Any, Union +from typing import List, Optional, Dict, Any from pathlib import Path from abc import abstractmethod -from llama_index.core.readers import SimpleDirectoryReader, FileSystemReaderMixin +from typing_extensions import Unpack + +from llama_index.core.readers import ( + SimpleDirectoryReader, + FileSystemReaderMixin, + DirectoryReaderArgs, +) from llama_index.core.readers.base import ( BaseReader, ResourcesReaderMixin, ) from llama_index.core.schema import Document -from llama_index.core.bridge.pydantic import Field from llama_index.readers.box.BoxAPI.box_api import ( add_extra_header_to_box_client, @@ -47,7 +52,9 @@ def class_name(cls) -> str: def __init__( self, box_client: BoxClient, + **kwargs: Unpack[DirectoryReaderArgs], ): + super().__init__(**kwargs) self._box_client = add_extra_header_to_box_client(box_client) @abstractmethod @@ -266,23 +273,15 @@ class BoxReader(BoxReaderBase): Attributes: _box_client (BoxClient): An authenticated Box client object used for interacting with the Box API. - file_extractor (Optional[Dict[str, Union[str, BaseReader]]], optional): - A dictionary mapping file extensions or mimetypes to either a string - specifying a custom extractor function or another BaseReader subclass - for handling specific file formats. Defaults to None. + **kwargs: Additional keyword arguments passed to SimpleDirectoryReader. """ - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = Field( - default=None, exclude=True - ) - def __init__( self, box_client: BoxClient, - file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None, + **kwargs: Unpack[DirectoryReaderArgs], ): - super().__init__(box_client=box_client) - self.file_extractor = file_extractor + super().__init__(box_client=box_client, **kwargs) def load_data( self, @@ -345,7 +344,7 @@ def get_metadata(filename: str) -> Any: simple_loader = SimpleDirectoryReader( input_dir=temp_dir, file_metadata=get_metadata, - file_extractor=self.file_extractor, + **self.model_dump(), ) return simple_loader.load_data() diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-onedrive/llama_index/readers/microsoft_onedrive/base.py b/llama-index-integrations/readers/llama-index-readers-microsoft-onedrive/llama_index/readers/microsoft_onedrive/base.py index e023626d16d11..cfa79b8299305 100644 --- a/llama-index-integrations/readers/llama-index-readers-microsoft-onedrive/llama_index/readers/microsoft_onedrive/base.py +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-onedrive/llama_index/readers/microsoft_onedrive/base.py @@ -12,6 +12,7 @@ from llama_index.core.readers import SimpleDirectoryReader, DirectoryReaderArgs from llama_index.core.readers.base import BasePydanticReader +from llama_index.core.readers.file.base import DirectoryReaderData from llama_index.core.schema import Document from llama_index.core.bridge.pydantic import PrivateAttr, BaseModel from llama_index.core.readers import FileSystemReaderMixin @@ -32,7 +33,9 @@ class _OneDriveResourcePayload(BaseModel): downloaded_file_path: Optional[str] -class OneDriveReader(BasePydanticReader, ResourcesReaderMixin, FileSystemReaderMixin): +class OneDriveReader( + BasePydanticReader, ResourcesReaderMixin, FileSystemReaderMixin, DirectoryReaderData +): """ Microsoft OneDrive reader. @@ -54,7 +57,7 @@ class OneDriveReader(BasePydanticReader, ResourcesReaderMixin, FileSystemReaderM :param file_paths (List[str], optional): List of specific file paths to download. Will be used if the parameter is not provided when calling load_data(). :param file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that file to text. See `SimpleDirectoryReader` for more details. - :param required_exts (Optional[List[str]]): List of required extensions. Default is None. + :param **kwargs (Unpack[DirectoryReaderArgs]): Additional arguments to pass to the directory reader. For interactive authentication to work, a browser is used to authenticate, hence the registered application should have a redirect URI set to 'https://localhost' diff --git a/llama-index-integrations/readers/llama-index-readers-minio/llama_index/readers/minio/minio_client/base.py b/llama-index-integrations/readers/llama-index-readers-minio/llama_index/readers/minio/minio_client/base.py index 0f6b91b0318a5..afc0a8f9d0bc1 100644 --- a/llama-index-integrations/readers/llama-index-readers-minio/llama_index/readers/minio/minio_client/base.py +++ b/llama-index-integrations/readers/llama-index-readers-minio/llama_index/readers/minio/minio_client/base.py @@ -8,12 +8,14 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union -from llama_index.core.readers import SimpleDirectoryReader -from llama_index.core.readers.base import BaseReader +from typing_extensions import Unpack + +from llama_index.core.readers import SimpleDirectoryReader, DirectoryReaderArgs +from llama_index.core.readers.base import BaseReader, BasePydanticReader from llama_index.core.schema import Document -class MinioReader(BaseReader): +class MinioReader(BaseReader, BasePydanticReader): """General reader for any Minio file or directory.""" def __init__( @@ -33,7 +35,7 @@ def __init__( minio_access_key: Optional[str] = None, minio_secret_key: Optional[str] = None, minio_session_token: Optional[str] = None, - **kwargs: Any, + **kwargs: Unpack[DirectoryReaderArgs], ) -> None: """Initialize Minio bucket and key, along with credentials if needed. @@ -45,11 +47,6 @@ def __init__( this loader will iterate through the entire bucket. prefix (Optional[str]): the prefix to filter by in the case that the loader iterates through the entire bucket. Defaults to empty string. - file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file - extension to a BaseReader class that specifies how to convert that file - to text. See `SimpleDirectoryReader` for more details. - required_exts (Optional[List[str]]): List of required extensions. - Default is None. num_files_limit (Optional[int]): Maximum number of files to read. Default is None. file_metadata (Optional[Callable[str, Dict]]): A function that takes @@ -62,6 +59,7 @@ def __init__( minio_session_token (Optional[str]): The Minio session token. minio_secure: MinIO server runs in TLS mode minio_cert_check: allows the usage of a self-signed cert for MinIO server + **kwargs: Additional arguments to pass to the simple directory reader. """ super().__init__(*args, **kwargs)