From 4b97e39815c9c0f247203e706798a16860a2262f Mon Sep 17 00:00:00 2001 From: EdgeNeko Date: Thu, 9 May 2024 21:54:20 +0800 Subject: [PATCH 1/3] Add local metadata storage support --- app/Services/vector_db_context.py | 21 ++++++++++---- app/config.py | 14 ++++++++-- app/util/retry_deco_async.py | 6 ++-- app/webapp.py | 1 + config/default.env | 46 ++++++++++++++++++++++++------- readme.md | 19 ++++++++++++- readme_cn.md | 23 ++++++++++++++-- tests/unit/test_retry_deco.py | 2 ++ 8 files changed, 105 insertions(+), 27 deletions(-) diff --git a/app/Services/vector_db_context.py b/app/Services/vector_db_context.py index 36fc7f8..1e4b624 100644 --- a/app/Services/vector_db_context.py +++ b/app/Services/vector_db_context.py @@ -12,7 +12,7 @@ from app.Models.img_data import ImageData from app.Models.query_params import FilterParams from app.Models.search_result import SearchResult -from app.config import config +from app.config import config, QdrantMode from app.util.retry_deco_async import wrap_object, retry_async @@ -28,11 +28,20 @@ class VectorDbContext: AVAILABLE_POINT_TYPES = models.Record | models.ScoredPoint | models.PointStruct def __init__(self): - self._client = AsyncQdrantClient(host=config.qdrant.host, port=config.qdrant.port, - grpc_port=config.qdrant.grpc_port, api_key=config.qdrant.api_key, - prefer_grpc=config.qdrant.prefer_grpc) - - wrap_object(self._client, retry_async((AioRpcError, HTTPError))) + match config.qdrant.mode: + case QdrantMode.SERVER: + self._client = AsyncQdrantClient(host=config.qdrant.host, port=config.qdrant.port, + grpc_port=config.qdrant.grpc_port, api_key=config.qdrant.api_key, + prefer_grpc=config.qdrant.prefer_grpc) + wrap_object(self._client, retry_async((AioRpcError, HTTPError))) + case QdrantMode.LOCAL: + self._client = AsyncQdrantClient(local_file=config.qdrant.local_file) + case QdrantMode.MEMORY: + logger.warning("Using in-memory Qdrant client. Data will be lost after application restart. 
" + "This should only be used for testing and debugging.") + self._client = AsyncQdrantClient(":memory:") + case _: + raise ValueError("Invalid Qdrant mode.") self.collection_name = config.qdrant.coll async def retrieve_by_id(self, image_id: str, with_vectors=False) -> ImageData: diff --git a/app/config.py b/app/config.py index 01a9d31..bbe71f3 100644 --- a/app/config.py +++ b/app/config.py @@ -8,7 +8,15 @@ DOCKER_SECRETS_DIR = '/run/secrets' +class QdrantMode(str, Enum): + SERVER = 'server' + LOCAL = 'local' + MEMORY = 'memory' + + class QdrantSettings(BaseModel): + mode: QdrantMode = QdrantMode.SERVER + host: str = 'localhost' port: int = 6333 grpc_port: int = 6334 @@ -16,6 +24,8 @@ class QdrantSettings(BaseModel): prefer_grpc: bool = True api_key: str | None = None + local_path: str = './images_metadata' + class ModelsSettings(BaseModel): clip: str = 'openai/clip-vit-large-patch14' @@ -55,7 +65,7 @@ def enabled(self): class StorageSettings(BaseModel): - method: StorageMode = StorageMode.LOCAL # set designed to be "disabled" for compatibility checking in StaticFileSettings + method: StorageMode = StorageMode.LOCAL s3: S3StorageSettings = S3StorageSettings() local: LocalStorageSettings = LocalStorageSettings() @@ -95,8 +105,6 @@ class Environment(BaseModel): def _check_deprecated_settings(_config): if _config.static_file.path != '[DEPRECATED]': logger.warning("Config StaticFileSettings is deprecated and should not be set.") - # if _config.storage.method == '[DISABLED]': - # raise DeprecationWarning("Config StaticFileSettings is deprecated, use StorageSettings instead!") config = Config() diff --git a/app/util/retry_deco_async.py b/app/util/retry_deco_async.py index c818bed..c705350 100644 --- a/app/util/retry_deco_async.py +++ b/app/util/retry_deco_async.py @@ -27,7 +27,5 @@ async def f_retry(*args, **kwargs): def wrap_object(obj: object, deco: Callable[[Callable], Callable]): for attr in dir(obj): - if not attr.startswith('_'): - attr_val = getattr(obj, attr) - 
if callable(attr_val) and asyncio.iscoroutinefunction(attr_val): - setattr(obj, attr, deco(getattr(obj, attr))) + if not attr.startswith('_') and asyncio.iscoroutinefunction(attr_val := getattr(obj, attr)): + setattr(obj, attr, deco(attr_val)) diff --git a/app/webapp.py b/app/webapp.py index 7f283ae..1e554c5 100644 --- a/app/webapp.py +++ b/app/webapp.py @@ -21,6 +21,7 @@ @asynccontextmanager async def lifespan(_: FastAPI): provider = ServiceProvider() + search_controller.services = provider admin_controller.services = provider yield diff --git a/config/default.env b/config/default.env index 6ef07a8..caaf218 100644 --- a/config/default.env +++ b/config/default.env @@ -3,7 +3,16 @@ # You can also use environment variables or docker secrets to set these values (the key should correspond to the key below). # Checkout https://docs.pydantic.dev/latest/concepts/pydantic_settings/ for more information. +# ------ # Vector Database Configuration +# ------ +# Mode for the vector database, options include "server" (default), "local" and "memory" +# - server: The preferred mode, uses Qdrant server for vector storage. +# - local: Store vectors as a file on the local disk, this is not recommended for production use (see readme for more information) +# - memory: Uses in-memory storage for vector storage, this is not persistent and should only be used for testing and debugging. 
+# APP_QDRANT__MODE=server + +# Remote Qdrant Server Configuration # Hostname or IP address of the Qdrant server # APP_QDRANT__HOST="localhost" # Port number for the Qdrant HTTP server @@ -17,11 +26,23 @@ # Collection name to use in Qdrant # APP_QDRANT__COLL="NekoImg" -# Inference device Configuration -# Setting this to "auto" allows the system to automatically detect and use available devices, otherwise specify the device name +# Local Qdrant File Configuration +# Path to the local directory where vectors will be stored +# APP_QDRANT__LOCAL_PATH="./images_metadata" + + +# ------ +# Server Configuration +# ------ +# Specify the device PyTorch uses when inferring vectors. Setting this to "auto" allows the system to automatically detect and use available devices, otherwise specify the device name # APP_DEVICE="auto" +# List of allowed origins for CORS (Cross-Origin Resource Sharing) +# APP_CORS_ORIGINS=["*"] + +# ------ # Models Configuration +# ------ # Model used for CLIP embeddings (Vision Search), accepts both huggingface hub (transformers) model name and path to the model. # APP_MODEL__CLIP="openai/clip-vit-large-patch14" # Model used for BERT embeddings (OCR Search), accepts both huggingface hub (transformers) model name and path to the model. 
@@ -30,35 +51,40 @@ # APP_MODEL__EASYPADDLEOCR="" -# BERT Configuration +# ------ +# OCR Search Configuration +# ------ # Enable OCR search functionality # APP_OCR_SEARCH__ENABLE=True # OCR module to use for text extraction # APP_OCR_SEARCH__OCR_MODULE="easypaddleocr" -# BERT model to use for text embedding -# APP_OCR_SEARCH__BERT_MODEL="bert-base-chinese" # Minimum confidence level required for OCR results to be considered # APP_OCR_SEARCH__OCR_MIN_CONFIDENCE=1e-2 # List of languages supported by the OCR module # APP_OCR_SEARCH__OCR_LANGUAGE=["ch_sim", "en"] -# Server Configuration -# List of allowed origins for CORS (Cross-Origin Resource Sharing) -# APP_CORS_ORIGINS=["*"] +# ------ +# Admin API Configuration +# ------ # Set to True to enable admin API, this allows you to access the admin API using the token specified below. # APP_ADMIN_API_ENABLE=False - # Uncomment the line below if you enabled admin API. Use this token to access admin API. For security reasons, the admin token is always required if you want to use admin API. # APP_ADMIN_TOKEN="your-super-secret-admin-token" + +# ------ # Access Protection Configuration +# ------ # Set to True to enable access protection using tokens # APP_ACCESS_PROTECTED=False # Use this token to access the API. This is required if you enabled access protection. # APP_ACCESS_TOKEN="your-super-secret-access-token" -# Storage Settings - Global + +# ------ +# Storage Settings +# ------ # Method for storing files, options includes "local", "s3" and "disabled" # APP_STORAGE__METHOD="local" diff --git a/readme.md b/readme.md index f3a909c..1940c33 100644 --- a/readme.md +++ b/readme.md @@ -34,7 +34,12 @@ image search. ### 🖥️ Local Deployment -#### Deploy Qdrant Database +#### Choose a metadata storage method + +##### Qdrant Database (Recommended) + +In most cases, we recommend using the Qdrant database to store metadata. The Qdrant database provides efficient +retrieval performance, flexible scalability, and better data security. 
Please deploy the Qdrant database according to the [Qdrant documentation](https://qdrant.tech/documentation/quick-start/). It is recommended to use Docker for @@ -43,6 +48,18 @@ deployment. If you don't want to deploy Qdrant yourself, you can use the [online service provided by Qdrant](https://qdrant.tech/documentation/cloud/). +##### Local File Storage + +Local file storage directly stores image metadata (including feature vectors, etc.) in a local SQLite database. It is +only recommended for small-scale deployments or development deployments. + +Local file storage does not require an additional database deployment process, but has the following disadvantages: + +- Local storage does not index and optimize vectors, so the time complexity of all searches is `O(n)`. Therefore, if the + data scale is large, the performance of search and indexing will decrease. +- Using local file storage will make NekoImageGallery stateful, so it will lose horizontal scalability. +- When you want to migrate to Qdrant database for storage, the indexed metadata may be difficult to migrate directly. + #### Deploy NekoImageGallery 1. Clone the project directory to your own PC or server. 
diff --git a/readme_cn.md b/readme_cn.md index a604ba4..36a0ff5 100644 --- a/readme_cn.md +++ b/readme_cn.md @@ -29,13 +29,30 @@ ## ✈️部署 -### 本地部署 -#### 部署Qdrant数据库 +### 🖥️ 本地部署 + +#### 选择元数据存储方式 + +NekoImageGallery支持两种元数据存储方式:Qdrant数据库存储与本地文件存储。您可以根据自己的需求选择其中一种方式。 + +##### Qdrant数据库 (推荐) + +在大多数情况下,我们推荐使用Qdrant数据库存储元数据。Qdrant数据库提供了高效的检索性能,灵活的扩展性以及更好的数据安全性。 请根据[Qdrant文档](https://qdrant.tech/documentation/quick-start/)部署Qdrant数据库,推荐使用docker部署。 如果你不想自己部署Qdrant,可以使用[Qdrant官方提供的在线服务](https://qdrant.tech/documentation/cloud/)。 +##### 本地文件存储 + +本地文件存储直接将图片元数据(包括特征向量等)存在本地的Sqlite数据库中。仅建议在小规模部署或开发部署中使用。 + +本地文件存储不需要额外的数据库部署流程,但是存在以下缺点: + +- 本地存储没有对向量进行索引和优化,所有搜索的时间复杂度为`O(n)`,因此若数据规模较大,搜索与索引的性能会下降。 +- 使用本地文件存储会使得NekoImageGallery变得有状态,因此会丧失横向扩展能力。 +- 当你希望迁移到Qdrant数据库进行存储时,已索引的元数据可能难以直接迁移。 + #### 部署NekoImageGallery 1. 将项目目录clone到你自己的PC或服务器中。 2. 强烈建议在python venv虚拟环境中安装本项目所需依赖, 运行下面命令: @@ -75,7 +92,7 @@ 你可以通过`--host`指定希望绑定到的ip地址(默认为0.0.0.0),通过`--port`指定希望绑定到的端口(默认为8000)。 9. (可选)部署前端应用:[NekoImageGallery.App](https://github.com/hv0905/NekoImageGallery.App)是本项目的一个简易web前端应用,如需部署请参照它的[部署文档](https://github.com/hv0905/NekoImageGallery.App)。 -### Docker Compose容器化部署 +### 🐋 Docker 部署 > [!WARNING] > Docker Compose部署方式的支持目前仍处在alpha状态,可能不适用于所有环境(尤其是CUDA加速功能)。 diff --git a/tests/unit/test_retry_deco.py b/tests/unit/test_retry_deco.py index f9d47e7..4a37eef 100644 --- a/tests/unit/test_retry_deco.py +++ b/tests/unit/test_retry_deco.py @@ -10,6 +10,7 @@ class ExampleClass: def __init__(self): self.counter = 0 self.counter2 = 0 + self.not_func = 'not a function' async def example_method(self): await asyncio.sleep(0) @@ -37,6 +38,7 @@ def caller(): async def test_object_wrapper(self): obj = self.ExampleClass() wrap_object(obj, retry_async(ValueError, tries=2)) + assert isinstance(obj.not_func, str) with pytest.raises(ValueError): await obj.example_method() assert await obj.example_method() == 3 From 6b2089ca0bd46733eb6a85426e7025553329a25a Mon Sep 17 00:00:00 2001 From: pk5ls20 Date: Thu, 9 May 
2024 23:25:28 +0800 Subject: [PATCH 2/3] Update app/Services/vector_db_context.py --- app/Services/vector_db_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/Services/vector_db_context.py b/app/Services/vector_db_context.py index 1e4b624..9f21f40 100644 --- a/app/Services/vector_db_context.py +++ b/app/Services/vector_db_context.py @@ -35,7 +35,7 @@ def __init__(self): prefer_grpc=config.qdrant.prefer_grpc) wrap_object(self._client, retry_async((AioRpcError, HTTPError))) case QdrantMode.LOCAL: - self._client = AsyncQdrantClient(local_file=config.qdrant.local_file) + self._client = AsyncQdrantClient(path=config.qdrant.local_path) case QdrantMode.MEMORY: logger.warning("Using in-memory Qdrant client. Data will be lost after application restart. " "This should only be used for testing and debugging.") From de4fd3e34e33e4657e5057c96e1d3b88805366e2 Mon Sep 17 00:00:00 2001 From: EdgeNeko Date: Thu, 9 May 2024 23:42:45 +0800 Subject: [PATCH 3/3] Initialize collection if not present when startup --- .gitignore | 1 + app/Services/provider.py | 3 +++ app/Services/vector_db_context.py | 5 +++++ app/webapp.py | 1 + scripts/db_migrations.py | 1 + scripts/local_create_thumbnail.py | 1 + scripts/local_indexing.py | 1 + 7 files changed, 13 insertions(+) diff --git a/.gitignore b/.gitignore index 1b256a7..b00d08f 100644 --- a/.gitignore +++ b/.gitignore @@ -241,5 +241,6 @@ cython_debug/ static/ qdrant_data/ +images_metadata/ local_*/ .idea \ No newline at end of file diff --git a/app/Services/provider.py b/app/Services/provider.py index c36cb57..c85f36c 100644 --- a/app/Services/provider.py +++ b/app/Services/provider.py @@ -44,3 +44,6 @@ def __init__(self): if config.admin_api_enable: self.upload_service = UploadService(self.storage_service, self.db_context, self.index_service) + + async def onload(self): + await self.db_context.onload() diff --git a/app/Services/vector_db_context.py b/app/Services/vector_db_context.py index 9f21f40..0b78389 100644 
--- a/app/Services/vector_db_context.py +++ b/app/Services/vector_db_context.py @@ -44,6 +44,11 @@ def __init__(self): raise ValueError("Invalid Qdrant mode.") self.collection_name = config.qdrant.coll + async def onload(self): + if not await self.check_collection(): + logger.warning("Collection not found. Initializing...") + await self.initialize_collection() + async def retrieve_by_id(self, image_id: str, with_vectors=False) -> ImageData: """ Retrieve an item from database by id. Will raise PointNotFoundError if the given ID doesn't exist. diff --git a/app/webapp.py b/app/webapp.py index 1e554c5..d7c948c 100644 --- a/app/webapp.py +++ b/app/webapp.py @@ -21,6 +21,7 @@ @asynccontextmanager async def lifespan(_: FastAPI): provider = ServiceProvider() + await provider.onload() search_controller.services = provider admin_controller.services = provider diff --git a/scripts/db_migrations.py b/scripts/db_migrations.py index fc76200..a457a26 100644 --- a/scripts/db_migrations.py +++ b/scripts/db_migrations.py @@ -34,6 +34,7 @@ async def migrate_v1_v2(): async def migrate(from_version: int): global services services = ServiceProvider() + await services.onload() match from_version: case 1: await migrate_v1_v2() diff --git a/scripts/local_create_thumbnail.py b/scripts/local_create_thumbnail.py index 1011967..160e870 100644 --- a/scripts/local_create_thumbnail.py +++ b/scripts/local_create_thumbnail.py @@ -9,6 +9,7 @@ async def main(): services = ServiceProvider() + await services.onload() # Here path maybe either local path or pure path count = 0 async for item in services.storage_service.active_storage.list_files("", '*.*', batch_max_files=1): diff --git a/scripts/local_indexing.py b/scripts/local_indexing.py index d696c85..9c38422 100644 --- a/scripts/local_indexing.py +++ b/scripts/local_indexing.py @@ -51,6 +51,7 @@ async def copy_and_index_batch(file_path_list: list[tuple[Path, str]]): async def main(args): global services services = ServiceProvider() + await 
services.onload() root = Path(args.local_index_target_dir) # First, check if the database is empty item_number = await services.db_context.get_counts(exact=False)