Skip to content

Commit

Permalink
Working hash validation test
Browse files Browse the repository at this point in the history
  • Loading branch information
Will Langdale committed Oct 21, 2024
1 parent 6af27c9 commit fbde0bd
Showing 1 changed file with 67 additions and 116 deletions.
183 changes: 67 additions & 116 deletions test/server/test_adapter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import pytest
from dotenv import find_dotenv, load_dotenv
from matchbox.common.exceptions import MatchboxDBDataError
from matchbox.common.hash import HASH_FUNC
from matchbox.helpers.selector import query, selector, selectors
from matchbox.server.models import Source
from matchbox.server.postgresql import MatchboxPostgres
from pandas import DataFrame
from pytest import FixtureRequest

from ..fixtures.db import (
AddDedupeModelsAndDataCallable,
Expand Down Expand Up @@ -138,7 +140,7 @@ def test_query_with_dedupe_model(
db_add_dedupe_models_and_data: AddDedupeModelsAndDataCallable,
db_add_indexed_data: AddIndexedDataCallable,
warehouse_data: list[Source],
request: FixtureRequest,
request: pytest.FixtureRequest,
):
"""Test querying data from a deduplication point of truth."""
# Setup
Expand Down Expand Up @@ -185,7 +187,7 @@ def test_query_with_link_model(
db_add_indexed_data: AddIndexedDataCallable,
db_add_link_models_and_data: AddLinkModelsAndDataCallable,
warehouse_data: list[Source],
request: FixtureRequest,
request: pytest.FixtureRequest,
):
"""Test querying data from a link point of truth."""
# Setup
Expand Down Expand Up @@ -239,44 +241,80 @@ def test_query_with_link_model(
assert crn_duns.cluster_hash.nunique() == 1000


def test_validate_hashes(matchbox_postgres):
def test_validate_data_hashes():
pass
def test_validate_hashes(
matchbox_postgres: MatchboxPostgres,
db_add_dedupe_models_and_data: AddDedupeModelsAndDataCallable,
db_add_indexed_data: AddIndexedDataCallable,
warehouse_data: list[Source],
request: pytest.FixtureRequest,
):
"""Test validating data hashes."""
# Setup
db_add_dedupe_models_and_data(
db_add_indexed_data=db_add_indexed_data,
backend=matchbox_postgres,
warehouse_data=warehouse_data,
dedupe_data=dedupe_data_test_params,
dedupe_models=[dedupe_model_test_params[0]], # Naive deduper,
request=request,
)

def test_validate_cluster_hashes():
pass
crn = warehouse_data[0]
select_crn = selector(
table=str(crn),
fields=["company_name", "crn"],
engine=crn.database.engine,
)
df_crn = query(
selector=select_crn,
backend=matchbox_postgres,
model="naive_test.crn",
return_type="pandas",
)

def test_validate_nonexistent_hashes():
pass
# Test validating data hashes
matchbox_postgres.validate_hashes(
hashes=df_crn.data_hash.to_list(), hash_type="data"
)

# Test validating cluster hashes
matchbox_postgres.validate_hashes(
hashes=df_crn.cluster_hash.drop_duplicates().to_list(), hash_type="cluster"
)

# Test validating nonexistent hashes errors
with pytest.raises(MatchboxDBDataError):
matchbox_postgres.validate_hashes(
hashes=[HASH_FUNC(b"nonexistant").digest()], hash_type="data"
)


def test_get_dataset(matchbox_postgres):
def test_get_dataset(matchbox_postgres: MatchboxPostgres):
# Test getting an existing model
pass


def test_get_model_subgraph(matchbox_postgres):
def test_get_model_subgraph(matchbox_postgres: MatchboxPostgres):
# Test getting the model subgraph
pass


def test_get_model(matchbox_postgres):
def test_get_model(matchbox_postgres: MatchboxPostgres):
# Test getting an existing model
pass


def test_delete_model(matchbox_postgres):
def test_delete_existing_model():
pass
def test_delete_leaf_model(matchbox_postgres: MatchboxPostgres):
"""Test deletion of a model with no dependencies."""
pass

def test_delete_nonexistent_model():
pass

def test_delete_model_without_confirmation():
pass
def test_delete_node_model(matchbox_postgres: MatchboxPostgres):
"""Test deletion of a model with downstream dependencies."""
pass


def test_insert_model(matchbox_postgres):
def test_insert_model(matchbox_postgres: MatchboxPostgres):
def test_insert_deduper_model():
pass

Expand All @@ -290,120 +328,33 @@ def test_insert_duplicate_model():
# Additional tests for other properties and methods


def test_datasets_property(matchbox_postgres):
def test_datasets_property(matchbox_postgres: MatchboxPostgres):
pass


def test_models_property(matchbox_postgres):
def test_models_property(matchbox_postgres: MatchboxPostgres):
pass


def test_models_from_property(matchbox_postgres):
def test_models_from_property(matchbox_postgres: MatchboxPostgres):
pass


def test_data_property(matchbox_postgres):
def test_data_property(matchbox_postgres: MatchboxPostgres):
pass


def test_clusters_property(matchbox_postgres):
def test_clusters_property(matchbox_postgres: MatchboxPostgres):
pass


def test_creates_property(matchbox_postgres):
def test_creates_property(matchbox_postgres: MatchboxPostgres):
pass


def test_merges_property(matchbox_postgres):
def test_merges_property(matchbox_postgres: MatchboxPostgres):
pass


def test_proposes_property(matchbox_postgres):
def test_proposes_property(matchbox_postgres: MatchboxPostgres):
pass


# def test_add_dedupers_and_data(
# db_engine, db_clear_models, db_add_dedupe_models_and_data, request
# ):
# """
# Test that adding models and generated data for deduplication processes works.
# """
# db_clear_models(db_engine)
# db_add_dedupe_models_and_data(
# db_engine=db_engine,
# dedupe_data=dedupe_data_test_params,
# dedupe_models=[dedupe_model_test_params[0]], # Naive deduper
# request=request,
# )

# dedupe_test_params_dict = {
# test_param.source: test_param for test_param in dedupe_data_test_params
# }

# with Session(db_engine) as session:
# model_list = session.query(Models).all()

# assert len(model_list) == len(dedupe_data_test_params)

# for model in model_list:
# deduplicates = (
# session.query(SourceDataset.db_schema, SourceDataset.db_table)
# .filter(SourceDataset.uuid == model.deduplicates)
# .first()
# )

# test_param = dedupe_test_params_dict[
# f"{deduplicates[0]}.{deduplicates[1]}"
# ]

# assert session.scalar(model.dedupes_count()) == test_param.tgt_prob_n
# # We assert unique_n rather than tgt_clus_n because tgt_clus_n
# # checks what the deduper found, not what was inserted
# assert session.scalar(model.creates_count()) == test_param.unique_n

# db_clear_models(db_engine)


# def test_add_linkers_and_data(
# db_engine,
# db_clear_models,
# db_add_dedupe_models_and_data,
# db_add_link_models_and_data,
# request,
# ):
# """
# Test that adding models and generated data for link processes works.
# """
# naive_deduper_params = [dedupe_model_test_params[0]] # Naive deduper
# deterministic_linker_params = [link_model_test_params[0]] # Deterministic linker

# db_clear_models(db_engine)
# db_add_link_models_and_data(
# db_engine=db_engine,
# db_add_dedupe_models_and_data=db_add_dedupe_models_and_data,
# dedupe_data=dedupe_data_test_params,
# dedupe_models=naive_deduper_params,
# link_data=link_data_test_params,
# link_models=deterministic_linker_params,
# request=request,
# )

# with Session(db_engine) as session:
# model_list = session.query(Models).filter(Models.deduplicates == None).all() # NoQA E711

# assert len(model_list) == len(link_data_test_params)

# for fx_linker, fx_data in itertools.product(
# deterministic_linker_params, link_data_test_params
# ):
# linker_name = f"{fx_linker.name}_{fx_data.source_l}_{fx_data.source_r}"

# with Session(db_engine) as session:
# model = session.query(Models).filter(Models.name == linker_name).first()

# assert session.scalar(model.links_count()) == fx_data.tgt_prob_n
# # We assert unique_n rather than tgt_clus_n because tgt_clus_n
# # checks what the linker found, not what was inserted
# assert session.scalar(model.creates_count()) == fx_data.unique_n

# db_clear_models(db_engine)

0 comments on commit fbde0bd

Please sign in to comment.