-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Working debug in unit tests, removed a bunch of hard-coded references to SHA-1, first working unit test
- Loading branch information
Will Langdale
committed
Oct 17, 2024
1 parent
e4f60b6
commit 62040c4
Showing
23 changed files
with
305 additions
and
257 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
"version": "0.2.0", | ||
"configurations": [ | ||
{ | ||
"name": "Matchbox: Debug", | ||
"type": "debugpy", | ||
"request": "launch", | ||
"program": "${file}", | ||
"purpose": ["debug-test"], | ||
"console": "integratedTerminal", | ||
"justMyCode": false, | ||
"env": { | ||
"PYTEST_ADDOPTS": "--no-cov", | ||
"PYTHONPATH": "${workspaceFolder}" | ||
}, | ||
"python": "${workspaceFolder}/.venv/bin/python", | ||
"cwd": "${workspaceFolder}", | ||
"args": [ | ||
"-v", | ||
"-s" | ||
] | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
import hashlib | ||
from typing import Any, TypeVar | ||
from uuid import UUID | ||
|
||
from matchbox.server.base import IndexableDataset | ||
from pandas import DataFrame, Series | ||
from sqlalchemy import String, func, select | ||
from sqlalchemy.orm import Session | ||
|
||
# Generic element type used by value-ordered hashing helpers below.
T = TypeVar("T")
# Types accepted by prep_for_hash(); anything else is not hashable here.
HashableItem = TypeVar("HashableItem", bytes, bool, str, int, float, bytearray)

# Single point of configuration for the hash algorithm used across this module.
# NOTE(review): SHA-1 is flagged by CodeQL as a weak hash; acceptable only if
# these digests are used for deduplication/identity, never for security — confirm.
HASH_FUNC = hashlib.sha1
|
||
|
||
def dataset_to_hashlist(dataset: IndexableDataset, uuid: UUID) -> list[dict[str, Any]]:
    """Retrieve and hash a dataset from its warehouse, ready to be inserted.

    Groups identical rows (ignoring the primary key) and emits one record per
    distinct row, carrying the hash of the concatenated column values plus the
    list of primary keys that share that content.

    Args:
        dataset: source dataset descriptor; provides the warehouse engine,
            the table mapping and the primary-key column name (db_pk).
        uuid: identifier of the dataset, stamped onto every emitted record.

    Returns:
        A list of dicts with keys "sha1" (row-content digest), "id" (list of
        primary keys, as strings, sharing that content) and "dataset" (uuid).
    """
    with Session(dataset.database.engine) as warehouse_session:
        source_table = dataset.to_table()

        # Exclude the primary key from the columns to be hashed
        cols = tuple(
            [col for col in list(source_table.c.keys()) if col != dataset.db_pk]
        )

        # Concatenate the non-PK columns into one string per row and collect
        # all PKs that share identical content via GROUP BY + array_agg.
        # NOTE(review): concat() joins values with no separator, so rows like
        # ("ab", "c") and ("a", "bc") collapse to the same "raw" — confirm
        # whether a delimiter is needed before relying on these hashes.
        slct_stmt = select(
            func.concat(*source_table.c[cols]).label("raw"),
            func.array_agg(source_table.c[dataset.db_pk].cast(String)).label("id"),
        ).group_by(*source_table.c[cols])

        raw_result = warehouse_session.execute(slct_stmt)

        # NOTE(review): the key is still literally "sha1" even though HASH_FUNC
        # is configurable — presumably tied to a column name in the target
        # schema; verify before renaming.
        to_insert = [
            {
                "sha1": hash_data(data.raw),
                "id": data.id,
                "dataset": uuid,
            }
            for data in raw_result.all()
        ]

    return to_insert
|
||
|
||
def prep_for_hash(item: HashableItem) -> bytes:
    """Convert a supported value to bytes so it can be fed to the hash function.

    Args:
        item: value to encode; bytes pass through unchanged, strings are
            UTF-8 encoded, UUIDs use their 16-byte form, and numerics
            (bool/int/float) are encoded via their repr.

    Returns:
        A bytes encoding of *item*.

    Raises:
        TypeError: if *item* is of an unsupported type.
    """
    if isinstance(item, bytes):
        return item
    if isinstance(item, bytearray):
        return bytes(item)
    if isinstance(item, str):
        return item.encode()
    if isinstance(item, UUID):
        return item.bytes
    if isinstance(item, (bool, int, float)):
        # Previously numerics fell through to bytes(item): bytes(int) yields
        # n ZERO bytes (so 0, False and "" all collided) and bytes(float)
        # raises TypeError despite float being in HashableItem. Encoding the
        # repr is deterministic and collision-free across numeric values.
        return repr(item).encode()
    raise TypeError(f"Cannot prepare a {type(item).__name__} for hashing")
|
||
|
||
def hash_data(data: str) -> bytes:
    """Hash a single value with the module-wide hash function.

    The value is first normalised to bytes via prep_for_hash, then
    digested with HASH_FUNC.
    """
    prepared = prep_for_hash(data)
    return HASH_FUNC(prepared).digest()
|
||
|
||
def list_to_value_ordered_hash(list_: list[T]) -> bytes:
    """Returns a single hash of a list ordered by its values.

    The values are sorted first so that any permutation of the same values
    produces the same hash. The result folds the digests of the second and
    subsequent values into the (still-open) hash of the first value.

    Args:
        list_: values to hash; must be mutually orderable.

    Returns:
        The combined digest as bytes.

    Raises:
        ValueError: if the list is empty.
        TypeError: if the values cannot be ordered against each other.
    """
    try:
        sorted_vals = sorted(list_)
    except TypeError as e:
        raise TypeError("Can only order lists or columns of the same datatype.") from e

    # Previously an empty input crashed below with a bare IndexError on
    # hashed_vals_list[0]; fail up front with a clear message instead.
    if not sorted_vals:
        raise ValueError("Cannot hash an empty list.")

    hashed_vals_list = [HASH_FUNC(prep_for_hash(i)) for i in sorted_vals]

    # Keep the first hash object open and absorb the digests of the rest,
    # preserving the original folding scheme (and therefore hash values).
    hashed_vals = hashed_vals_list[0]
    for val in hashed_vals_list[1:]:
        hashed_vals.update(val.digest())

    return hashed_vals.digest()
|
||
|
||
def columns_to_value_ordered_hash(data: DataFrame, columns: list[str]) -> Series:
    """Returns the rowwise hash ordered by the row's values, ignoring column order.

    Adds a value-order-invariant hash per row: the selected columns are cast
    to bytes and each row's values are hashed after sorting, so swapping
    column order (e.g. left vs right parent tables) cannot change the result.

    Args:
        data: frame containing the columns to hash.
        columns: names of the columns to include in each row's hash.

    Returns:
        A Series of one digest (bytes) per row of *data*.
    """
    rows_as_bytes = data.filter(columns).astype(bytes).to_dict("records")
    digests = [
        list_to_value_ordered_hash(row.values()) for row in rows_as_bytes
    ]
    return Series(digests)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.