Skip to content

Commit

Permalink
Removed wildcard selector and column type declaration
Browse files (browse the repository at this point in the history)
  • Loading branch information
wpfl-dbt committed Dec 12, 2024
1 parent f55866e commit 3e75fb5
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 53 deletions.
29 changes: 11 additions & 18 deletions src/matchbox/common/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
Handles three scenarios:
1. No columns specified - all columns except primary key are indexed
2. Indices from database - uses existing column hash information
3. Columns specified in TOML - uses specified columns with optional '*'
3. Columns specified in TOML - specified columns are indexed
"""
# Initialise warehouse and get table metadata
warehouse = (
Expand Down Expand Up @@ -280,14 +280,16 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":

# Case 3: Columns from TOML
local_columns = []
star_index = None

# Process TOML column specifications
for i, column in enumerate(index_data):
if column["literal"] == "*":
star_index = i
continue
local_columns.append(SourceColumn(**column, indexed=True))
for column in index_data:
local_columns.append(
SourceColumn(
literal=column["literal"],
alias=column.get("alias", column["literal"]),
indexed=True,
)
)

# Match remote columns with local specifications
indexed_columns = []
Expand All @@ -296,23 +298,14 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
for remote_col in remote_columns:
matched = False
for local_col in local_columns:
if remote_col == local_col:
if local_col.type is None:
local_col.type = remote_col.type
if remote_col.literal == local_col.literal:
indexed_columns.append(local_col)
matched = True
break
if not matched:
non_indexed_columns.append(remote_col)

# Handle wildcard insertion
if star_index is not None:
for col in non_indexed_columns:
col.indexed = True
indexed_columns[star_index:star_index] = non_indexed_columns
data["db_columns"] = indexed_columns
else:
data["db_columns"] = indexed_columns + non_indexed_columns
data["db_columns"] = indexed_columns + non_indexed_columns

return data

Expand Down
43 changes: 8 additions & 35 deletions test/client/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,9 @@ def source_toml(source: Source, index: list[dict[str, str]]) -> str:
{"literal": "company_name"},
{"literal": "crn"},
],
[{"literal": "company_name", "type": "VARCHAR", "alias": "name"}],
[
{"literal": "company_name"},
{"literal": "*"},
],
[
{"literal": "*"},
{"literal": "company_name"},
],
[{"literal": "company_name", "alias": "name"}],
),
ids=["vanilla", "alias_and_type", "star_end", "star_start"],
ids=["vanilla", "alias"],
)
def test_load_datasets_from_config(
index: list[dict[str, str]],
Expand All @@ -76,9 +68,7 @@ def test_load_datasets_from_config(

# Helper variables
source = config.get(crn.alias)
named = [idx["literal"] for idx in index if idx["literal"] != "*"]
has_star = any(idx["literal"] == "*" for idx in index)
star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None)
named = [idx["literal"] for idx in index]
col_names = [col.literal.name for col in source.db_columns]

# Test 1: Core attributes match
Expand All @@ -91,33 +81,16 @@ def test_load_datasets_from_config(
# Test 2: All non-pk columns present
assert set(col_names) == {"company_name", "crn", "id"} - {source.db_pk}

# Test 3: Column indexing
for col in source.db_columns:
assert col.indexed == (has_star or col.literal.name in named)

# Test 4: Aliases and types match
# Test 3: Aliases match
for idx in index:
if idx["literal"] == "*":
continue
col = next(c for c in source.db_columns if c.literal.name == idx["literal"])
assert col.alias.name == idx.get("alias", idx["literal"])
assert col.type == idx.get("type", col.type)

# Test 5: Column ordering
if star_pos is None:
for i, name in enumerate(named):
assert col_names[i] == name
else:
for i, idx in enumerate(index):
if idx["literal"] != "*":
if i < star_pos:
assert col_names[i] == idx["literal"]
else:
star_col_count = len(col_names) - len(index) + 1
assert col_names[i + star_col_count - 1] == idx["literal"]

# Test 6: column equalities
# Test 4: Column ordering
for i, name in enumerate(named):
assert col_names[i] == name

# Test 5: column equalities
assert source.db_columns[0] != source.db_columns[1]
assert source.db_columns[0] == source.db_columns[0]
assert source.db_columns[1].literal.hash == source.db_columns[1]
Expand Down

0 comments on commit 3e75fb5

Please sign in to comment.