From 0f5ca81b8bac003932b04ddfe6fa45c7dbfc3e2c Mon Sep 17 00:00:00 2001 From: Vitali Yanushchyk Date: Wed, 27 Nov 2024 06:39:01 -0300 Subject: [PATCH] add ! hde client --- pyproject.toml | 1 + src/country_workspace/config/__init__.py | 2 + src/country_workspace/config/fragments/app.py | 4 ++ .../config/fragments/constance.py | 15 +++- src/country_workspace/config/settings.py | 1 + .../contrib/dedup_engine/__init__.py | 0 .../contrib/dedup_engine/adapters/__init__.py | 4 ++ .../contrib/dedup_engine/adapters/base.py | 72 +++++++++++++++++++ .../adapters/deduplication_set.py | 39 ++++++++++ .../dedup_engine/adapters/duplicate.py | 26 +++++++ .../contrib/dedup_engine/adapters/ignored.py | 45 ++++++++++++ .../contrib/dedup_engine/adapters/image.py | 53 ++++++++++++++ .../contrib/dedup_engine/adapters/mixins.py | 44 ++++++++++++ .../contrib/dedup_engine/apps.py | 6 ++ .../contrib/dedup_engine/client.py | 64 +++++++++++++++++ .../contrib/dedup_engine/endpoints.py | 19 +++++ .../contrib/dedup_engine/models/__init__.py | 4 ++ .../contrib/dedup_engine/models/constants.py | 3 + .../dedup_engine/models/deduplication_set.py | 32 +++++++++ .../contrib/dedup_engine/models/duplicate.py | 15 ++++ .../contrib/dedup_engine/models/ignored.py | 16 +++++ .../contrib/dedup_engine/models/image.py | 21 ++++++ .../contrib/dedup_engine/models/util.py | 58 +++++++++++++++ .../contrib/dedup_engine/types.py | 22 ++++++ uv.lock | 50 +++++++++++++ 25 files changed, 615 insertions(+), 1 deletion(-) create mode 100644 src/country_workspace/contrib/dedup_engine/__init__.py create mode 100644 src/country_workspace/contrib/dedup_engine/adapters/__init__.py create mode 100644 src/country_workspace/contrib/dedup_engine/adapters/base.py create mode 100644 src/country_workspace/contrib/dedup_engine/adapters/deduplication_set.py create mode 100644 src/country_workspace/contrib/dedup_engine/adapters/duplicate.py create mode 100644 src/country_workspace/contrib/dedup_engine/adapters/ignored.py create mode 100644 src/country_workspace/contrib/dedup_engine/adapters/image.py create mode 100644 src/country_workspace/contrib/dedup_engine/adapters/mixins.py create mode 100644 src/country_workspace/contrib/dedup_engine/apps.py create mode 100644 src/country_workspace/contrib/dedup_engine/client.py create mode 100644 src/country_workspace/contrib/dedup_engine/endpoints.py create mode 100644 src/country_workspace/contrib/dedup_engine/models/__init__.py create mode 100644 src/country_workspace/contrib/dedup_engine/models/constants.py create mode 100644 src/country_workspace/contrib/dedup_engine/models/deduplication_set.py create mode 100644 src/country_workspace/contrib/dedup_engine/models/duplicate.py create mode 100644 src/country_workspace/contrib/dedup_engine/models/ignored.py create mode 100644 src/country_workspace/contrib/dedup_engine/models/image.py create mode 100644 src/country_workspace/contrib/dedup_engine/models/util.py create mode 100644 src/country_workspace/contrib/dedup_engine/types.py diff --git a/pyproject.toml b/pyproject.toml index 145ff0f..a8d4b19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dependencies = [ "sentry-sdk>=2.7.1", "social-auth-app-django", "unicef-security>=1.5.1", + "pydantic>=2.10.1", ] [project.scripts] celery-monitor = "country_workspace.__monitor__:run" diff --git a/src/country_workspace/config/__init__.py b/src/country_workspace/config/__init__.py index f20e3a0..c6cedb6 100644 --- a/src/country_workspace/config/__init__.py +++ b/src/country_workspace/config/__init__.py @@ -100,6 +100,8 @@ class Group(Enum): "https://django-environ.readthedocs.io/en/latest/types.html#environ-env-db-url", ), "DEBUG": (bool, False, True, False, setting("debug")), + "DEDUP_ENGINE_API_TOKEN": (str, "", "", False, "Dedup engine API token"), + "DEDUP_ENGINE_API_URL": (str, "", "", False, "Dedup engine API url"), # "EMAIL_BACKEND": ( # str, # "django.core.mail.backends.smtp.EmailBackend", diff --git a/src/country_workspace/config/fragments/app.py b/src/country_workspace/config/fragments/app.py index ee93ee9..c044793 100644 --- a/src/country_workspace/config/fragments/app.py +++ b/src/country_workspace/config/fragments/app.py @@ -12,6 +12,10 @@ HOPE_API_TOKEN = env("HOPE_API_TOKEN") HOPE_API_URL = env("HOPE_API_URL") +DEDUP_ENGINE_API_TOKEN = env("DEDUP_ENGINE_API_TOKEN") +DEDUP_ENGINE_API_URL = env("DEDUP_ENGINE_API_URL") + + HH_LOOKUPS = [ "ResidenceStatus", ] diff --git a/src/country_workspace/config/fragments/constance.py b/src/country_workspace/config/fragments/constance.py index b102843..95c0710 100644 --- a/src/country_workspace/config/fragments/constance.py +++ b/src/country_workspace/config/fragments/constance.py @@ -1,4 +1,12 @@ -from .app import AURORA_API_TOKEN, AURORA_API_URL, HOPE_API_TOKEN, HOPE_API_URL, NEW_USER_DEFAULT_GROUP +from country_workspace.config.fragments.app import ( + AURORA_API_TOKEN, + AURORA_API_URL, + DEDUP_ENGINE_API_TOKEN, + DEDUP_ENGINE_API_URL, + HOPE_API_TOKEN, + HOPE_API_URL, + NEW_USER_DEFAULT_GROUP, +) CONSTANCE_BACKEND = "constance.backends.database.DatabaseBackend" @@ -46,6 +54,9 @@ "AURORA_API_URL": (AURORA_API_URL, "Aurora API Server address", str), "HOPE_API_TOKEN": (HOPE_API_TOKEN, "HOPE API Access Token", "write_only_input"), "HOPE_API_URL": (HOPE_API_URL, "HOPE API Server address", str), + "DEDUP_ENGINE_API_TOKEN": (DEDUP_ENGINE_API_TOKEN, "Dedup engine API Access Token", "write_only_input"), + "DEDUP_ENGINE_API_URL": (DEDUP_ENGINE_API_URL, "Dedup engine API Server address", str), + "KOBO_API_URL": ("", "Kobo API Server address", str), "KOBO_API_TOKEN": ("", "Kobo API Access Token", "write_only_input"), "KOBO_API_URL": ("", "Kobo API Server address", str), "CACHE_TIMEOUT": (86400, "Cache Redis TTL", int), @@ -58,6 +69,8 @@ "Remote System Tokens": ( "AURORA_API_TOKEN", "AURORA_API_URL", + "DEDUP_ENGINE_API_TOKEN", + "DEDUP_ENGINE_API_URL", "HOPE_API_TOKEN", "HOPE_API_URL", "KOBO_API_TOKEN", diff --git a/src/country_workspace/config/settings.py b/src/country_workspace/config/settings.py index 1d09ab9..b59d66a 100644 --- a/src/country_workspace/config/settings.py +++ b/src/country_workspace/config/settings.py @@ -50,6 +50,7 @@ "country_workspace.security", "country_workspace.apps.HCWConfig", "country_workspace.workspaces.apps.Config", + "country_workspace.contrib.dedup_engine.apps.Config", "country_workspace.versioning", "country_workspace.cache", # these should be optional in the future diff --git a/src/country_workspace/contrib/dedup_engine/__init__.py b/src/country_workspace/contrib/dedup_engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/country_workspace/contrib/dedup_engine/adapters/__init__.py b/src/country_workspace/contrib/dedup_engine/adapters/__init__.py new file mode 100644 index 0000000..a50d246 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/adapters/__init__.py @@ -0,0 +1,4 @@ +from .deduplication_set import DeduplicationSetAdapter # noqa +from .duplicate import DuplicateAdapter # noqa +from .ignored import IgnoredAdapter # noqa +from .image import ImageAdapter # noqa diff --git a/src/country_workspace/contrib/dedup_engine/adapters/base.py b/src/country_workspace/contrib/dedup_engine/adapters/base.py new file mode 100644 index 0000000..af42c15 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/adapters/base.py @@ -0,0 +1,72 @@ +from enum import Enum +from typing import Any, Generic, Type + +from requests import Session + +from country_workspace.contrib.dedup_engine.adapters.mixins import URLMixin, ValidationMixin +from country_workspace.contrib.dedup_engine.endpoints import Endpoint +from country_workspace.contrib.dedup_engine.types import TCreate, TModel + + +class HTTPMethod(Enum): + GET = "GET" + POST = "POST" + DELETE = "DELETE" + + +class BaseAdapter(Generic[TModel, TCreate], URLMixin, ValidationMixin): + def __init__( + self, session: Session, endpoints: Endpoint, model_class: Type[TModel], create_class: Type[TCreate] = None + ) -> None: + self.session = session + self.endpoints = endpoints + self.model_class = model_class + self.create_class = create_class or model_class + + def list(self, url_path: str, **kwargs) -> list[TModel]: + url = self.prepare_url(url_path, **kwargs) + response = self._request(HTTPMethod.GET.value, url) + return [self.model_class(**item) for item in response.json()] + + def retrieve(self, url_path: str, **kwargs) -> TModel: + url = self.prepare_url(url_path, **kwargs) + response = self._request(HTTPMethod.GET.value, url) + return self.model_class(**response.json()) + + def create(self, url_path: str, data: TCreate, **kwargs) -> TModel: + url = self.prepare_url(url_path, **kwargs) + response = self._request( + HTTPMethod.POST.value, url, json=self.validate_data(data, self.create_class).model_dump(by_alias=True) + ) + print(f"{response.json()=}") + return self.model_class(**response.json()) + + def destroy(self, url_path: str, **kwargs) -> None: + url = self.prepare_url(url_path, **kwargs) + response = self._request(HTTPMethod.DELETE.value, url) + if response.status_code != 204: + response.raise_for_status() + + def update(self, url_path: str, data: TModel, **kwargs) -> TModel: + raise NotImplementedError("Update method is not implemented") + + def _request(self, method: str, url: str, **kwargs) -> Any: + print(f"{method} {url=}") + response = self.session.request(method, url, **kwargs) + response.raise_for_status() + return response + + def _action( + self, + method: HTTPMethod, + url_path: str, + *, + path_params: dict[str, Any] = None, + data: Any = None, + ) -> Any: + path_params = path_params or {} + url = self.prepare_url(url_path, **path_params) + response = self._request(method.value, url, json=data) + if response.content: + return response.json() + return None diff --git a/src/country_workspace/contrib/dedup_engine/adapters/deduplication_set.py b/src/country_workspace/contrib/dedup_engine/adapters/deduplication_set.py new file mode 100644 index 0000000..18d1491 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/adapters/deduplication_set.py @@ -0,0 +1,39 @@ +from typing import override +from uuid import UUID + +from country_workspace.contrib.dedup_engine.adapters.base import BaseAdapter, HTTPMethod +from country_workspace.contrib.dedup_engine.models import DeduplicationSet, DeduplicationSetCreate + + +class DeduplicationSetAdapter(BaseAdapter[DeduplicationSet, DeduplicationSetCreate]): + @override + def list(self) -> list[DeduplicationSet]: + return super().list(url_path=self.endpoints.deduplication_set) + + @override + def retrieve(self, data: DeduplicationSet | UUID) -> DeduplicationSet: + return super().retrieve( + url_path=self.endpoints.deduplication_set_detail, + deduplication_set_id=self.get_entity_id(data), + ) + + @override + def create(self, data: DeduplicationSetCreate) -> DeduplicationSet: + return super().create( + url_path=self.endpoints.deduplication_set, + data=data, + ) + + @override + def destroy(self, data: DeduplicationSet | UUID) -> None: + super().destroy( + url_path=self.endpoints.deduplication_set_detail, + deduplication_set_id=self.get_entity_id(data), + ) + + def process(self, data: DeduplicationSet | UUID) -> None: + self._action( + HTTPMethod.POST, + self.endpoints.process, + path_params={"deduplication_set_id": self.get_entity_id(data)}, + ) diff --git a/src/country_workspace/contrib/dedup_engine/adapters/duplicate.py b/src/country_workspace/contrib/dedup_engine/adapters/duplicate.py new file mode 100644 index 0000000..1513e50 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/adapters/duplicate.py @@ -0,0 +1,26 @@ +from typing import override +from uuid import UUID + +from country_workspace.contrib.dedup_engine.adapters.base import BaseAdapter +from country_workspace.contrib.dedup_engine.models import DeduplicationSet, Duplicate + + +class DuplicateAdapter(BaseAdapter[Duplicate, None]): + @override + def list(self, deduplication_set: DeduplicationSet | UUID) -> list[Duplicate]: + return super().list( + url_path=self.endpoints.duplicate, + deduplication_set_id=self.get_entity_id(deduplication_set), + ) + + @override + def retrieve(self, *args, **kwargs) -> None: + raise NotImplementedError("Retrieval of Duplicate objects is not supported.") + + @override + def create(self, *args, **kwargs) -> None: + raise NotImplementedError("Creation of Duplicate objects is not supported.") + + @override + def destroy(self, *args, **kwargs) -> None: + raise NotImplementedError("Deletion of Duplicate objects is not supported.") diff --git a/src/country_workspace/contrib/dedup_engine/adapters/ignored.py b/src/country_workspace/contrib/dedup_engine/adapters/ignored.py new file mode 100644 index 0000000..0649112 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/adapters/ignored.py @@ -0,0 +1,45 @@ +from typing import Type, override +from uuid import UUID + +from requests import Session + +from country_workspace.contrib.dedup_engine.adapters.base import BaseAdapter, Endpoint +from country_workspace.contrib.dedup_engine.models import DeduplicationSet, Ignored, IgnoredCreate + + +class IgnoredAdapter(BaseAdapter[Ignored, IgnoredCreate]): + def __init__( + self, + session: Session, + endpoints: Endpoint, + model_class: Type[Ignored], + create_class: Type[IgnoredCreate], + resource_type: str, + ) -> None: + super().__init__(session, endpoints, model_class, create_class) + self.resource_type = resource_type + + @override + def list(self, deduplication_set: DeduplicationSet | UUID) -> list[Ignored]: + return super().list( + url_path=self.endpoints.ignored, + deduplication_set_id=self.get_entity_id(deduplication_set), + resource_type=self.resource_type, + ) + + @override + def create(self, deduplication_set: DeduplicationSet | UUID, data: IgnoredCreate) -> Ignored: + return super().create( + url_path=self.endpoints.ignored, + deduplication_set_id=self.get_entity_id(deduplication_set), + resource_type=self.resource_type, + data=data, + ) + + @override + def retrieve(self, *args, **kwargs) -> None: + raise NotImplementedError("Retrieval of Ignored objects is not supported.") + + @override + def destroy(self, *args, **kwargs) -> None: + raise NotImplementedError("Deletion of Ignored objects is not supported.") diff --git a/src/country_workspace/contrib/dedup_engine/adapters/image.py b/src/country_workspace/contrib/dedup_engine/adapters/image.py new file mode 100644 index 0000000..5a12a47 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/adapters/image.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from typing import override +from uuid import UUID + +from country_workspace.contrib.dedup_engine.adapters.base import BaseAdapter, HTTPMethod +from country_workspace.contrib.dedup_engine.models import DeduplicationSet, Image, ImageCreate + + +class ImageAdapter(BaseAdapter[Image, ImageCreate]): + + @override + def list(self, deduplication_set: DeduplicationSet | UUID) -> list[Image]: + return super().list( + url_path=self.endpoints.image, + deduplication_set_id=self.get_entity_id(deduplication_set), + ) + + @override + def create(self, deduplication_set: DeduplicationSet | UUID, data: ImageCreate) -> Image: + return super().create( + url_path=self.endpoints.image, + deduplication_set_id=self.get_entity_id(deduplication_set), + data=data, + ) + + @override + def destroy(self, deduplication_set: DeduplicationSet | UUID, image: Image | UUID) -> None: + return super().destroy( + url_path=self.endpoints.image_detail, + deduplication_set_id=self.get_entity_id(deduplication_set), + image_id=self.get_entity_id(image), + ) + + def create_bulk(self, deduplication_set: DeduplicationSet | UUID, data: list[ImageCreate]) -> list[Image]: + validated_data = [self.validate_data(item, ImageCreate).model_dump(by_alias=True) for item in data] + response_data = self._action( + HTTPMethod.POST, + self.endpoints.image_bulk, + path_params={"deduplication_set_id": self.get_entity_id(deduplication_set)}, + data=validated_data, + ) + return [Image(**item) for item in response_data] + + def destroy_bulk(self, deduplication_set: DeduplicationSet | UUID) -> None: + self._action( + HTTPMethod.DELETE, + self.endpoints.image_bulk_clear, + path_params={"deduplication_set_id": self.get_entity_id(deduplication_set)}, + ) + + def retrieve(self, *args, **kwargs) -> None: + raise NotImplementedError("Retrieval of Image objects is not supported.") diff --git a/src/country_workspace/contrib/dedup_engine/adapters/mixins.py b/src/country_workspace/contrib/dedup_engine/adapters/mixins.py new file mode 100644 index 0000000..090733b --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/adapters/mixins.py @@ -0,0 +1,44 @@ +from typing import Any, Type +from urllib.parse import urljoin +from uuid import UUID + +from country_workspace.contrib.dedup_engine.models import DeduplicationSet, Image +from country_workspace.contrib.dedup_engine.types import TModel + + +class URLMixin: + def prepare_url(self, path: str, **kwargs) -> str: + try: + formatted_path = path.format(**kwargs) + except KeyError as e: + raise ValueError(f"Missing placeholder '{e.args[0]}' in kwargs for path: '{path}'") + return urljoin(self.endpoints.base, formatted_path) + + +class ValidationMixin: + @staticmethod + def get_entity_id(entity: DeduplicationSet | Image | UUID, id_field: str = "id") -> UUID: + match entity: + case UUID(): + return entity + case _ if isinstance(entity, (DeduplicationSet, Image)): + try: + return getattr(entity, id_field) + except AttributeError: + raise AttributeError(f"'{type(entity).__name__}' does not have '{id_field}' attribute") + case _: + raise TypeError( + f"Invalid type for entity: {type(entity).__name__}. Expected UUID, DeduplicationSet, or Image." + ) + + @staticmethod + def validate_data(data: Any, model_class: Type[TModel]) -> TModel: + match data: + case model_class(): + return data + case dict(): + return model_class.model_validate(data) + case _: + raise TypeError( + f"Expected data to be of type {model_class.__name__} or dict, but got {type(data).__name__}" + ) diff --git a/src/country_workspace/contrib/dedup_engine/apps.py b/src/country_workspace/contrib/dedup_engine/apps.py new file mode 100644 index 0000000..bf79f22 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class Config(AppConfig): + name = __name__.rpartition(".")[0] + verbose_name = "Country Workspace | Dedup Engine Client" diff --git a/src/country_workspace/contrib/dedup_engine/client.py b/src/country_workspace/contrib/dedup_engine/client.py new file mode 100644 index 0000000..302e5ec --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/client.py @@ -0,0 +1,64 @@ +from typing import Type + +from requests import Session +from requests.auth import AuthBase +from requests.models import PreparedRequest + +from country_workspace.contrib.dedup_engine.adapters import ( + DeduplicationSetAdapter, + DuplicateAdapter, + IgnoredAdapter, + ImageAdapter, +) +from country_workspace.contrib.dedup_engine.endpoints import Endpoint +from country_workspace.contrib.dedup_engine.models import ( + DeduplicationSet, + DeduplicationSetCreate, + Duplicate, + Ignored, + IgnoredCreate, + Image, + ImageCreate, +) +from country_workspace.contrib.dedup_engine.types import TCreate, TModel + + +class Auth(AuthBase): + def __init__(self, token: str) -> None: + self._auth_header = f"Token {token}" + + def __call__(self, request: PreparedRequest) -> PreparedRequest: + request.headers["Authorization"] = self._auth_header + return request + + +class HDEAPIClient: + def __init__(self, *, base_url: str, token: str) -> None: + self.session = Session() + self.session.auth = Auth(token) + self.endpoints = Endpoint(base=base_url.rstrip("/")) + + def _get_adapter( + self, adapter_class: type, model_class: Type[TModel], create_class: Type[TCreate] = None, **kwargs + ): + return adapter_class(self.session, self.endpoints, model_class, create_class, **kwargs) + + @property + def deduplication_set(self) -> DeduplicationSetAdapter: + return self._get_adapter(DeduplicationSetAdapter, DeduplicationSet, DeduplicationSetCreate) + + @property + def duplicate(self) -> DuplicateAdapter: + return self._get_adapter(DuplicateAdapter, Duplicate) + + @property + def ignored_filenames(self) -> IgnoredAdapter: + return self._get_adapter(IgnoredAdapter, Ignored, IgnoredCreate, resource_type="filenames") + + @property + def ignored_reference_pks(self) -> IgnoredAdapter: + return self._get_adapter(IgnoredAdapter, Ignored, IgnoredCreate, resource_type="reference_pks") + + @property + def image(self) -> ImageAdapter: + return self._get_adapter(ImageAdapter, Image, ImageCreate) diff --git a/src/country_workspace/contrib/dedup_engine/endpoints.py b/src/country_workspace/contrib/dedup_engine/endpoints.py new file mode 100644 index 0000000..31261be --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/endpoints.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass + + +@dataclass(frozen=True, slots=True) +class Endpoint: + base: str + deduplication_set: str = "deduplication_sets/" + deduplication_set_detail: str = "deduplication_sets/{deduplication_set_id}/" + duplicate: str = "deduplication_sets/{deduplication_set_id}/duplicates/" + ignored: str = "deduplication_sets/{deduplication_set_id}/ignored/{resource_type}/" + image: str = "deduplication_sets/{deduplication_set_id}/images/" + image_detail: str = "deduplication_sets/{deduplication_set_id}/images/{image_id}/" + image_bulk: str = "deduplication_sets/{deduplication_set_id}/images_bulk/" + image_bulk_clear: str = "deduplication_sets/{deduplication_set_id}/images_bulk/clear/" + process: str = "deduplication_sets/{deduplication_set_id}/process/" + + def __post_init__(self): + if not self.base.startswith("https://"): + raise ValueError(f"Invalid base URL: '{self.base}'. Must start with 'https://'.") diff --git a/src/country_workspace/contrib/dedup_engine/models/__init__.py b/src/country_workspace/contrib/dedup_engine/models/__init__.py new file mode 100644 index 0000000..60b419f --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/models/__init__.py @@ -0,0 +1,4 @@ +from .deduplication_set import DeduplicationSet, DeduplicationSetCreate # noqa +from .duplicate import Duplicate # noqa +from .ignored import Ignored, IgnoredCreate # noqa +from .image import Image, ImageCreate # noqa diff --git a/src/country_workspace/contrib/dedup_engine/models/constants.py b/src/country_workspace/contrib/dedup_engine/models/constants.py new file mode 100644 index 0000000..9d9613f --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/models/constants.py @@ -0,0 +1,3 @@ +from typing import Final + +REFERENCE_PK_LENGTH: Final[int] = 100 diff --git a/src/country_workspace/contrib/dedup_engine/models/deduplication_set.py b/src/country_workspace/contrib/dedup_engine/models/deduplication_set.py new file mode 100644 index 0000000..e101056 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/models/deduplication_set.py @@ -0,0 +1,32 @@ +from datetime import datetime +from typing import Annotated, Any +from uuid import UUID + +from pydantic import BaseModel, Field + +from .constants import REFERENCE_PK_LENGTH +from .util import DatetimeEncoderMixin, DeduplicationSetStatus, StatusEncoderMixin, merge_configs + + +class DeduplicationSetConfig(BaseModel): + name: Annotated[str | None, Field(max_length=128)] = None + settings: dict[str, Any] | None = None + + +class DeduplicationSetCreate(BaseModel): + reference_pk: Annotated[str, Field(max_length=REFERENCE_PK_LENGTH)] + name: Annotated[str | None, Field(max_length=128)] = None + description: str | None = None + notification_url: Annotated[str | None, Field(max_length=255)] = None + + +@merge_configs(DatetimeEncoderMixin, StatusEncoderMixin) +class DeduplicationSet(DeduplicationSetCreate): + id: UUID + state: DeduplicationSetStatus + config: DeduplicationSetConfig | None = None + created_at: datetime + updated_at: datetime | None = None + external_system: str + created_by: int | None = None + updated_by: int | None = None diff --git a/src/country_workspace/contrib/dedup_engine/models/duplicate.py b/src/country_workspace/contrib/dedup_engine/models/duplicate.py new file mode 100644 index 0000000..d93dba1 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/models/duplicate.py @@ -0,0 +1,15 @@ +from typing import Annotated + +from pydantic import BaseModel, Field + +from .constants import REFERENCE_PK_LENGTH + + +class DuplicateReference(BaseModel): + reference_pk: Annotated[str, Field(max_length=REFERENCE_PK_LENGTH)] + + +class Duplicate(BaseModel): + first: DuplicateReference + second: DuplicateReference + score: Annotated[float, Field(ge=0, le=1, description="Similarity score must be between 0 and 1")] diff --git a/src/country_workspace/contrib/dedup_engine/models/ignored.py b/src/country_workspace/contrib/dedup_engine/models/ignored.py new file mode 100644 index 0000000..219939c --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/models/ignored.py @@ -0,0 +1,16 @@ +from typing import Annotated +from uuid import UUID + +from pydantic import BaseModel, Field + +from .constants import REFERENCE_PK_LENGTH + + +class IgnoredCreate(BaseModel): + first: Annotated[str, Field(max_length=REFERENCE_PK_LENGTH)] + second: Annotated[str, Field(max_length=REFERENCE_PK_LENGTH)] + + +class Ignored(IgnoredCreate): + id: int + deduplication_set: UUID diff --git a/src/country_workspace/contrib/dedup_engine/models/image.py b/src/country_workspace/contrib/dedup_engine/models/image.py new file mode 100644 index 0000000..1f463ec --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/models/image.py @@ -0,0 +1,21 @@ +from datetime import datetime +from typing import Annotated +from uuid import UUID + +from pydantic import BaseModel, Field + +from .constants import REFERENCE_PK_LENGTH +from .util import DatetimeEncoderMixin, merge_configs + + +class ImageCreate(BaseModel): + reference_pk: Annotated[str, Field(max_length=REFERENCE_PK_LENGTH)] + filename: Annotated[str, Field(max_length=255)] + + +@merge_configs(DatetimeEncoderMixin) +class Image(ImageCreate): + id: UUID + deduplication_set: UUID + created_by: int | None = None + created_at: datetime diff --git a/src/country_workspace/contrib/dedup_engine/models/util.py b/src/country_workspace/contrib/dedup_engine/models/util.py new file mode 100644 index 0000000..370fc9e --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/models/util.py @@ -0,0 +1,58 @@ +from datetime import datetime +from enum import Enum +from typing import Self, Type, TypeVar + +from pydantic import ConfigDict + +T = TypeVar("T") + + +class DeduplicationSetStatus(Enum): + CLEAN = "Clean" + DIRTY = "Dirty" + + @property + def description(self) -> str: + descriptions = { + DeduplicationSetStatus.CLEAN: "Deduplication set is created or already processed", + DeduplicationSetStatus.DIRTY: "Deduplication set needs processing", + } + if self not in descriptions: + raise ValueError(f"Description for status {self} is not defined.") + return descriptions[self] + + @classmethod + def get_description(cls, status: Self) -> str: + if not isinstance(status, cls): + raise ValueError(f"Invalid status: {status}") + return status.description + + +def merge_configs(*mixins) -> Type[T]: + def decorator(cls: Type[T]) -> Type[T]: + merged_config = {} + for mixin in mixins: + for k, v in mixin.get_config().items(): + if k not in merged_config: + merged_config[k] = v + elif isinstance(v, dict) and isinstance(merged_config[k], dict): + merged_config[k].update(v) + else: + raise ValueError(f"Conflict in config key '{k}' for {cls.__name__}") + + cls.model_config = ConfigDict(**merged_config) + return cls + + return decorator + + +class DatetimeEncoderMixin: + @staticmethod + def get_config() -> ConfigDict: + return ConfigDict(json_encoders={datetime: lambda v: v.isoformat()}) + + +class StatusEncoderMixin: + @staticmethod + def get_config() -> ConfigDict: + return ConfigDict(json_encoders={DeduplicationSetStatus: lambda v: v.value}) diff --git a/src/country_workspace/contrib/dedup_engine/types.py b/src/country_workspace/contrib/dedup_engine/types.py new file mode 100644 index 0000000..7310fa1 --- /dev/null +++ b/src/country_workspace/contrib/dedup_engine/types.py @@ -0,0 +1,22 @@ +from typing import TypeVar + +from country_workspace.contrib.dedup_engine.models import ( + DeduplicationSet, + DeduplicationSetCreate, + Duplicate, + Ignored, + IgnoredCreate, + Image, + ImageCreate, +) + +TModel = TypeVar( + "TModel", DeduplicationSet, DeduplicationSetCreate, Duplicate, Ignored, IgnoredCreate, Image, ImageCreate +) + +TCreate = TypeVar( + "TCreate", + DeduplicationSetCreate, + IgnoredCreate, + ImageCreate, +) diff --git a/uv.lock b/uv.lock index 4417aab..69ac6f7 100644 --- a/uv.lock +++ b/uv.lock @@ -17,6 +17,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/26/99/fc813cd978842c26c82534010ea849eee9ab3a13ea2b74e95cb9c99e747b/amqp-5.3.1-py3-none-any.whl", hash = "sha256:43b3319e1b4e7d1251833a93d672b4af1e40f3d632d479b98661a95f117880a2", size = 50944 }, ] +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, +] + [[package]] name = "asgiref" version = "3.8.1" @@ -1161,6 +1170,7 @@ dependencies = [ { name = "openpyxl" }, { name = "psycopg2-binary" }, { name = "python-redis-lock", extra = ["django"] }, + { name = "pydantic" }, { name = "redis" }, { name = "sentry-sdk" }, { name = "social-auth-app-django" }, @@ -1257,6 +1267,7 @@ requires-dist = [ { name = "openpyxl", specifier = ">=3.1.5" }, { name = "psycopg2-binary", specifier = ">=2.9.9" }, { name = "python-redis-lock", extras = ["django"], specifier = ">=4.0.0" }, + { name = "pydantic", specifier = ">=2.10.1" }, { name = "redis" }, { name = "sentry-sdk", specifier = ">=2.7.1" }, { name = "social-auth-app-django" }, @@ -2161,6 +2172,45 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552 }, ] +[[package]] +name = "pydantic" +version = "2.10.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/bd/7fc610993f616d2398958d0028d15eaf53bde5f80cb2edb7aa4f1feaf3a7/pydantic-2.10.1.tar.gz", hash = "sha256:a4daca2dc0aa429555e0656d6bf94873a7dc5f54ee42b1f5873d666fb3f35560", size = 783717 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/fc/fda48d347bd50a788dd2a0f318a52160f911b86fc2d8b4c86f4d7c9bceea/pydantic-2.10.1-py3-none-any.whl", hash = "sha256:a8d20db84de64cf4a7d59e899c2caf0fe9d660c7cfc482528e7020d7dd189a7e", size = 455329 }, +] + +[[package]] +name = "pydantic-core" +version = "2.27.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/9f/7de1f19b6aea45aeb441838782d68352e71bfa98ee6fa048d5041991b33e/pydantic_core-2.27.1.tar.gz", hash = "sha256:62a763352879b84aa31058fc931884055fd75089cccbd9d58bb6afd01141b235", size = 412785 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/51/2e9b3788feb2aebff2aa9dfbf060ec739b38c05c46847601134cc1fed2ea/pydantic_core-2.27.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9cbd94fc661d2bab2bc702cddd2d3370bbdcc4cd0f8f57488a81bcce90c7a54f", size = 1895239 }, + { url = "https://files.pythonhosted.org/packages/7b/9e/f8063952e4a7d0127f5d1181addef9377505dcce3be224263b25c4f0bfd9/pydantic_core-2.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f8c4718cd44ec1580e180cb739713ecda2bdee1341084c1467802a417fe0f02", size = 1805070 }, + { url = "https://files.pythonhosted.org/packages/2c/9d/e1d6c4561d262b52e41b17a7ef8301e2ba80b61e32e94520271029feb5d8/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15aae984e46de8d376df515f00450d1522077254ef6b7ce189b38ecee7c9677c", size = 1828096 }, + { url = "https://files.pythonhosted.org/packages/be/65/80ff46de4266560baa4332ae3181fffc4488ea7d37282da1a62d10ab89a4/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ba5e3963344ff25fc8c40da90f44b0afca8cfd89d12964feb79ac1411a260ac", size = 1857708 }, + { url = "https://files.pythonhosted.org/packages/d5/ca/3370074ad758b04d9562b12ecdb088597f4d9d13893a48a583fb47682cdf/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:992cea5f4f3b29d6b4f7f1726ed8ee46c8331c6b4eed6db5b40134c6fe1768bb", size = 2037751 }, + { url = "https://files.pythonhosted.org/packages/b1/e2/4ab72d93367194317b99d051947c071aef6e3eb95f7553eaa4208ecf9ba4/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0325336f348dbee6550d129b1627cb8f5351a9dc91aad141ffb96d4937bd9529", size = 2733863 }, + { url = "https://files.pythonhosted.org/packages/8a/c6/8ae0831bf77f356bb73127ce5a95fe115b10f820ea480abbd72d3cc7ccf3/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7597c07fbd11515f654d6ece3d0e4e5093edc30a436c63142d9a4b8e22f19c35", size = 2161161 }, + { url = "https://files.pythonhosted.org/packages/f1/f4/b2fe73241da2429400fc27ddeaa43e35562f96cf5b67499b2de52b528cad/pydantic_core-2.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3bbd5d8cc692616d5ef6fbbbd50dbec142c7e6ad9beb66b78a96e9c16729b089", size = 1993294 }, + { url = "https://files.pythonhosted.org/packages/77/29/4bb008823a7f4cc05828198153f9753b3bd4c104d93b8e0b1bfe4e187540/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:dc61505e73298a84a2f317255fcc72b710b72980f3a1f670447a21efc88f8381", size = 2001468 }, + { url = "https://files.pythonhosted.org/packages/f2/a9/0eaceeba41b9fad851a4107e0cf999a34ae8f0d0d1f829e2574f3d8897b0/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:e1f735dc43da318cad19b4173dd1ffce1d84aafd6c9b782b3abc04a0d5a6f5bb", size = 2091413 }, + { url = "https://files.pythonhosted.org/packages/d8/36/eb8697729725bc610fd73940f0d860d791dc2ad557faaefcbb3edbd2b349/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f4e5658dbffe8843a0f12366a4c2d1c316dbe09bb4dfbdc9d2d9cd6031de8aae", size = 2154735 }, + { url = "https://files.pythonhosted.org/packages/52/e5/4f0fbd5c5995cc70d3afed1b5c754055bb67908f55b5cb8000f7112749bf/pydantic_core-2.27.1-cp312-none-win32.whl", hash = "sha256:672ebbe820bb37988c4d136eca2652ee114992d5d41c7e4858cdd90ea94ffe5c", size = 1833633 }, + { url = "https://files.pythonhosted.org/packages/ee/f2/c61486eee27cae5ac781305658779b4a6b45f9cc9d02c90cb21b940e82cc/pydantic_core-2.27.1-cp312-none-win_amd64.whl", hash = "sha256:66ff044fd0bb1768688aecbe28b6190f6e799349221fb0de0e6f4048eca14c16", size = 1986973 }, + { url = "https://files.pythonhosted.org/packages/df/a6/e3f12ff25f250b02f7c51be89a294689d175ac76e1096c32bf278f29ca1e/pydantic_core-2.27.1-cp312-none-win_arm64.whl", hash = "sha256:9a3b0793b1bbfd4146304e23d90045f2a9b5fd5823aa682665fbdaf2a6c28f3e", size = 1883215 }, +] + [[package]] name = "pyflakes" version = "3.2.0"