diff --git a/sample.datasets.toml b/sample.datasets.toml index ccc84a0d..e0dda57e 100644 --- a/sample.datasets.toml +++ b/sample.datasets.toml @@ -1,10 +1,10 @@ [warehouses.pg_warehouse] db_type = "postgresql" -username = "your_username" -password = "your_password" +database = "warehouse" +user = "warehouse_user" +password = "warehouse_password" host = "localhost" -port = 5432 -database = "companies_db" +port = 7654 [datasets.companies_house] database = "pg_warehouse" diff --git a/src/matchbox/admin.py b/src/matchbox/admin.py index 725eb1da..2a93d292 100644 --- a/src/matchbox/admin.py +++ b/src/matchbox/admin.py @@ -5,19 +5,26 @@ import tomli from dotenv import find_dotenv, load_dotenv +from matchbox.server import MatchboxDBAdapter, inject_backend from matchbox.server.base import ( - MatchboxSettings, Source, ) from matchbox.server.models import SourceWarehouse +logger = logging.getLogger("mb_logic") + dotenv_path = find_dotenv(usecwd=True) load_dotenv(dotenv_path) +@inject_backend +def index_dataset(backend: MatchboxDBAdapter, dataset: Source) -> None: + backend.index(dataset=dataset) + + def load_datasets_from_config(datasets: Path) -> dict[str, Source]: """Loads datasets for indexing from the datasets settings TOML file.""" - config = tomli.load(datasets) + config = tomli.loads(datasets.read_text()) warehouses: dict[str, SourceWarehouse] = {} for alias, warehouse_config in config["warehouses"].items(): @@ -36,12 +43,14 @@ def load_datasets_from_config(datasets: Path) -> dict[str, Source]: @click.argument( "datasets", type=click.Path(exists=True, dir_okay=False, path_type=Path) ) -def make_cmf(datasets: Path) -> None: - backend = MatchboxSettings().backend +@inject_backend +def make_matchbox(backend: MatchboxDBAdapter, datasets: Path) -> None: dataset_dict = load_datasets_from_config(datasets=datasets) for dataset in dataset_dict.values(): - backend.index(dataset=dataset, engine=dataset.database.engine) + logger.info(f"Indexing {dataset}") + index_dataset(dataset) + logger.info(f"Finished indexing {dataset}") if __name__ == "__main__": @@ -49,4 +58,4 @@ def make_cmf(datasets: Path) -> None: level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) - make_cmf() + make_matchbox() diff --git a/src/matchbox/server/base.py b/src/matchbox/server/base.py index 9e17a81d..9b472f47 100644 --- a/src/matchbox/server/base.py +++ b/src/matchbox/server/base.py @@ -14,6 +14,7 @@ cast, ) +from dotenv import find_dotenv, load_dotenv from pydantic import Field from pydantic_settings import BaseSettings, SettingsConfigDict from rustworkx import PyDiGraph @@ -36,6 +37,9 @@ Results = Any +dotenv_path = find_dotenv(usecwd=True) +load_dotenv(dotenv_path, override=True) + R = TypeVar("R") P = ParamSpec("P") diff --git a/src/matchbox/server/models.py b/src/matchbox/server/models.py index 4c63ff11..62603a00 100644 --- a/src/matchbox/server/models.py +++ b/src/matchbox/server/models.py @@ -155,7 +155,8 @@ def _select( def to_hash(self) -> bytes: """Generate a unique hash based on the table's columns and datatypes.""" table = self.to_table() - schema_representation = ",".join( + + schema_representation = f"{str(self)}: " + ",".join( f"{col.name}:{str(col.type)}" for col in table.columns ) return HASH_FUNC(schema_representation.encode("utf-8")).digest()