Skip to content

Commit

Permalink
Merge pull request #1642 from moj-analytical-services/update_sqlglot
Browse files Browse the repository at this point in the history
Update sqlglot to >=13.0.0
  • Loading branch information
ThomasHepworth authored Oct 11, 2023
2 parents 7489d75 + 23b17fe commit 26d7148
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 69 deletions.
12 changes: 6 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ jsonschema = ">=3.2,<5.0"
# 1.3.5 is the last version supporting py 3.7.1
pandas = ">1.3.0"
duckdb = ">=0.8.0"
# normalize issue in sqlglot - temporarily exclude updates
sqlglot = ">=7.0.0,<11.4.2"
sqlglot = ">=13.0.0, <19.0.0"
altair = "^5.0.1"
Jinja2 = ">=3.0.3"
phonetics = "^1.0.5"
Expand Down
5 changes: 3 additions & 2 deletions splink/comparison_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import sqlglot
from sqlglot.expressions import Identifier
from sqlglot.optimizer.normalize import normalize
from sqlglot.optimizer.simplify import simplify

from .constants import LEVEL_NOT_OBSERVED_TEXT
from .default_from_jsonschema import default_value_from_schema
Expand Down Expand Up @@ -495,7 +496,7 @@ def _is_exact_match(self):
sql_syntax_tree = sqlglot.parse_one(
self.sql_condition.lower(), read=self.sql_dialect
)
sql_cnf = normalize(sql_syntax_tree)
sql_cnf = simplify(normalize(sql_syntax_tree))

exprs = _get_and_subclauses(sql_cnf)
for expr in exprs:
Expand All @@ -508,7 +509,7 @@ def _exact_match_colnames(self):
sql_syntax_tree = sqlglot.parse_one(
self.sql_condition.lower(), read=self.sql_dialect
)
sql_cnf = normalize(sql_syntax_tree)
sql_cnf = simplify(normalize(sql_syntax_tree))

exprs = _get_and_subclauses(sql_cnf)
for expr in exprs:
Expand Down
9 changes: 3 additions & 6 deletions splink/input_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,11 @@ def _get_dialect_quotes(dialect):


def _get_sqlglot_dialect_quotes(dialect: sqlglot.Dialect):
    """Return the identifier-quote delimiters of a sqlglot dialect.

    Args:
        dialect: The sqlglot dialect object to inspect.

    Returns:
        A ``(start, end)`` tuple holding the dialect's opening and closing
        identifier delimiters.

    Raises:
        AttributeError: If the dialect exposes neither the upper-case nor the
            lower-case attribute pair.
    """
    try:
        # For sqlglot >= 16.0.0
        start = dialect.IDENTIFIER_START
        end = dialect.IDENTIFIER_END
    except AttributeError:
        # For sqlglot < 16.0.0, which uses lower-case attribute names
        start = dialect.identifier_start
        end = dialect.identifier_end
    return start, end
85 changes: 85 additions & 0 deletions tests/test_comparison_level.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from pytest import mark, raises

from splink.comparison_level import ComparisonLevel

from .decorator import mark_with_dialects_excluding


def make_comparison_level(sql_condition, dialect):
    """Build a ``ComparisonLevel`` from ``sql_condition`` for the given dialect.

    The level always carries the fixed chart label ``nice_informative_label``;
    ``dialect`` is forwarded as the ``sql_dialect`` keyword argument.
    """
    level_dict = {
        "sql_condition": sql_condition,
        "label_for_charts": "nice_informative_label",
    }
    return ComparisonLevel(level_dict, sql_dialect=dialect)


# SQL conditions that are of 'exact match' type.
# Each entry is a (sql_condition, expected_columns) pair: the second element is
# the set of column names that the tests compare against
# `_exact_match_colnames` for that condition.
exact_matchy_sql_conditions_and_columns = [
    # a single equality
    ("col_l = col_r", {"col"}),
    # two equalities joined by AND
    ("col_l = col_r AND another_col_l = another_col_r", {"col", "another_col"}),
    # three equalities joined by AND, no explicit grouping
    (
        "col_l = col_r AND another_col_l = another_col_r AND third_l = third_r",
        {"col", "another_col", "third"},
    ),
    # same three equalities, first two grouped in parentheses
    (
        "(col_l = col_r AND another_col_l = another_col_r) AND third_l = third_r",
        {"col", "another_col", "third"},
    ),
    # same three equalities, last two grouped in parentheses
    (
        "col_l = col_r AND (another_col_l = another_col_r AND third_l = third_r)",
        {"col", "another_col", "third"},
    ),
]


@mark.parametrize(
    "sql_condition, exact_match_cols", exact_matchy_sql_conditions_and_columns
)
@mark_with_dialects_excluding()
def test_is_exact_match_for_exact_matchy_levels(
    sql_condition, exact_match_cols, dialect
):
    """Every exact-match style condition must be flagged by ``_is_exact_match``.

    ``exact_match_cols`` is unused here; it is accepted only because the
    shared parametrisation supplies it alongside ``sql_condition``.
    """
    level = make_comparison_level(sql_condition, dialect)
    is_exact = level._is_exact_match
    assert is_exact


@mark.parametrize(
    "sql_condition, exact_match_cols", exact_matchy_sql_conditions_and_columns
)
@mark_with_dialects_excluding()
def test_exact_match_colnames_for_exact_matchy_levels(
    sql_condition, exact_match_cols, dialect
):
    """``_exact_match_colnames`` must yield exactly the expected column names.

    The comparison is made on sets, so neither order nor duplicates matter.
    """
    level = make_comparison_level(sql_condition, dialect)
    found_cols = set(level._exact_match_colnames)
    assert found_cols == exact_match_cols


# SQL conditions that are NOT of 'exact match' type.
# The tests below expect `_is_exact_match` to be falsy for each of these and
# `_exact_match_colnames` to raise ValueError.
non_exact_matchy_sql_conditions = [
    # function call compared with a threshold
    "levenshtein(col_l, col_r) < 3",
    # inequality rather than equality
    "col_l < col_r",
    # equalities joined by OR rather than AND
    "col_l = col_r OR another_col_l = another_col_r",
    # equality, but between differently named columns
    "col_l = a_different_col_r",
    # an equality AND-ed with an OR of equalities
    "col_l = col_r AND (col_2_l = col_2_r OR col_3_l = col_3_r)",
    # an equality AND-ed with an inequality
    "col_l = col_r AND (col_2_l < col_2_r)",
    # equality of function results rather than of bare columns
    "substr(col_l, 2) = substr(col_r, 2)",
]


@mark.parametrize("sql_condition", non_exact_matchy_sql_conditions)
@mark_with_dialects_excluding()
def test_is_exact_match_for_non_exact_matchy_levels(sql_condition, dialect):
    """No non-exact-match condition may be flagged by ``_is_exact_match``."""
    level = make_comparison_level(sql_condition, dialect)
    is_exact = level._is_exact_match
    assert not is_exact


@mark.parametrize("sql_condition", non_exact_matchy_sql_conditions)
@mark_with_dialects_excluding()
def test_exact_match_colnames_for_non_exact_matchy_levels(sql_condition, dialect):
    """``_exact_match_colnames`` must raise for non-exact-match conditions."""
    # Build the level outside the `raises` block so that only the attribute
    # access itself is allowed to produce the ValueError.
    level = make_comparison_level(sql_condition, dialect)
    # _exact_match_colnames should have an error if it is
    # not actually an exact match level
    with raises(ValueError):
        level._exact_match_colnames
51 changes: 0 additions & 51 deletions tests/test_compound_comparison_levels.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import pandas as pd
from sqlglot import parse_one
from sqlglot.optimizer.normalize import normalize

import splink.duckdb.comparison_level_library as cll
import splink.duckdb.comparison_library as cl
Expand Down Expand Up @@ -216,52 +214,3 @@ def test_complex_compound_comparison_level():
linker = DuckDBLinker(df, settings)

linker.estimate_parameters_using_expectation_maximisation("1=1")


def _assert_cnf_subclauses(sql, subclauses_expected):
    """Assert that normalising ``sql`` yields exactly the expected subclauses.

    Args:
        sql: Boolean SQL expression to parse and normalise.
        subclauses_expected: One list per expected subclause, each holding
            the acceptable spellings of that subclause (e.g. both operand
            orders). Every found subclause must match exactly one entry and
            every entry must be matched.

    Raises:
        AssertionError: If an unexpected subclause is found, or an expected
            one is missing.
    """
    sql_cnf = normalize(parse_one(sql)).sql().lower()

    # get subclauses and remove outer parens
    subclauses_found = [s.strip("()") for s in sql_cnf.split(" and ")]

    # consume a copy so the caller's list is left untouched
    remaining = list(subclauses_expected)

    # loop through subclauses, make sure that we have exactly one of each
    for found in subclauses_found:
        match_idx = next(
            (i for i, expected in enumerate(remaining) if found in expected),
            None,
        )
        assert match_idx is not None, f"CNF contains unexpected clause '{found}'"
        del remaining[match_idx]
    assert not remaining


def test_normalise():
    """Check that sqlglot's ``normalize`` produces the subclauses we rely on."""
    # check that the sqlglot normaliser is doing what we think
    # try to not impose specific form too strongly, so we aren't too tightly
    # coupled to the implementation
    _assert_cnf_subclauses(
        "a or (b and c)",
        [
            ["a or c", "c or a"],
            ["a or b", "b or a"],
        ],
    )

    # and a slightly more complex statement
    _assert_cnf_subclauses(
        "(a and b) or (a and c) or (c and d) or (d and b)",
        [
            ["b or c", "c or b"],
            ["a or d", "d or a"],
        ],
    )
8 changes: 6 additions & 2 deletions tests/test_sql_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,16 @@ def test_move_l_r_table_prefix_to_column_suffix():
move_l_r_test(br, expected)

br = "len(list_filter(l.name_list, x -> list_contains(r.name_list, x))) >= 1"
expected = "len(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
expected = (
"length(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
)
move_l_r_test(br, expected)

br = "len(list_filter(l.name_list, x -> list_contains(r.name_list, x))) >= 1"
res = move_l_r_table_prefix_to_column_suffix(br)
expected = "len(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
expected = (
"length(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
)
assert res.lower() == expected.lower()


Expand Down

0 comments on commit 26d7148

Please sign in to comment.