feat: add filter to rule #141

Merged: 10 commits, Feb 7, 2025
src/databricks/labs/dqx/engine.py: 4 changes (3 additions, 1 deletion)
@@ -136,20 +136,22 @@ def build_checks_by_metadata(checks: list[dict], glbs: dict[str, Any] | None = N
assert func # should already be validated
func_args = check.get("arguments", {})
criticality = check_def.get("criticality", "error")
filter_expr = check_def.get("filter")

if "col_names" in func_args:
logger.debug(f"Adding DQRuleColSet with columns: {func_args['col_names']}")
dq_rule_checks += DQRuleColSet(
columns=func_args["col_names"],
check_func=func,
criticality=criticality,
filter=filter_expr,
# provide arguments without "col_names"
check_func_kwargs={k: func_args[k] for k in func_args.keys() - {"col_names"}},
).get_rules()
else:
name = check_def.get("name", None)
check_func = func(**func_args)
- dq_rule_checks.append(DQRule(check=check_func, name=name, criticality=criticality))
+ dq_rule_checks.append(DQRule(check=check_func, name=name, criticality=criticality, filter=filter_expr))

logger.debug("Exiting build_checks_by_metadata function with dq_rule_checks")
return dq_rule_checks
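For orientation, a minimal sketch of a metadata-defined check that would hit the new filter path in build_checks_by_metadata. Only the top-level "filter" key is what this PR adds; the nested check/function/arguments layout and the col_name argument are inferred from the surrounding code rather than taken from this diff, so treat them as assumptions.

# Hypothetical metadata check definition; only the "filter" key is new in this PR.
# The check/function/arguments layout is assumed from how build_checks_by_metadata
# reads check_def and check["arguments"] above.
checks = [
    {
        "name": "col_b_is_null_or_empty",
        "criticality": "error",
        "filter": "a < 3",  # SQL expression, picked up via check_def.get("filter")
        "check": {
            "function": "is_not_null_and_not_empty",
            "arguments": {"col_name": "b"},
        },
    },
]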
src/databricks/labs/dqx/rule.py: 8 changes (7 additions, 1 deletion)
@@ -35,6 +35,7 @@ class DQRule:
check: Column
name: str = ""
criticality: str = Criticality.ERROR.value
filter: str | None = None

def __post_init__(self):
# take the name from the alias of the column expression if not provided
@@ -58,7 +59,10 @@ def check_column(self) -> Column:

:return: Column object
"""
- return F.when(self.check.isNull(), F.lit(None).cast("string")).otherwise(self.check)
+ # if filter is provided, apply the filter to the check
+ filter_col = F.expr(self.filter) if self.filter else F.lit(True)
+
+ return F.when(self.check.isNotNull(), F.when(filter_col, self.check)).otherwise(F.lit(None).cast("string"))


@dataclass(frozen=True)
@@ -75,6 +79,7 @@ class DQRuleColSet:
columns: list[str]
check_func: Callable
criticality: str = Criticality.ERROR.value
filter: str | None = None
check_func_args: list[Any] = field(default_factory=list)
check_func_kwargs: dict[str, Any] = field(default_factory=dict)

@@ -88,6 +93,7 @@ def get_rules(self) -> list[DQRule]:
rule = DQRule(
criticality=self.criticality,
check=self.check_func(col_name, *self.check_func_args, **self.check_func_kwargs),
filter=self.filter,
)
rules.append(rule)
return rules
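To make the new check_column behaviour concrete, here is a standalone PySpark sketch of the same when/filter pattern outside the DQRule class. The column names, filter expression, and error message are illustrative only, not part of this PR.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, None), (3, None)], "a int, b string")

# Illustrative check column: a non-null value means the row failed the check.
check = F.when(F.col("b").isNull() | (F.col("b") == ""), F.lit("Column b is null or empty"))

# Mirrors check_column above: the check only fires where the filter holds;
# everywhere else (and where the check itself is null) the result stays null.
filter_col = F.expr("a < 3")
result = F.when(check.isNotNull(), F.when(filter_col, check)).otherwise(F.lit(None).cast("string"))

df.withColumn("col_b_is_null_or_empty", result).show()
# Only the a = 1 row reports the error; the a = 3 row is excluded by the filter.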
tests/integration/test_apply_checks.py: 36 changes (36 additions, 0 deletions)
@@ -380,6 +380,42 @@ def test_apply_checks_by_metadata(ws, spark):
assert_df_equality(checked, expected, ignore_nullable=True)


def test_apply_checks_with_filter(ws, spark):
dq_engine = DQEngine(ws)
test_df = spark.createDataFrame(
[[1, 3, 3], [2, None, 4], [3, 4, None], [4, None, None], [None, None, None]], SCHEMA
)

checks = [
DQRule(
name="col_b_is_null_or_empty",
criticality="error",
check=is_not_null_and_not_empty("b"),
filter="a<3",
),
DQRule(
name="col_c_is_null_or_empty",
criticality="error",
check=is_not_null_and_not_empty("c"),
),
]

checked = dq_engine.apply_checks(test_df, checks)

expected = spark.createDataFrame(
[
[1, 3, 3, None, None],
[2, None, 4, {"col_b_is_null_or_empty": "Column b is null or empty"}, None],
[3, 4, None, {"col_c_is_null_or_empty": "Column c is null or empty"}, None],
[4, None, None, {"col_c_is_null_or_empty": "Column c is null or empty"}, None],
[None, None, None, {"col_c_is_null_or_empty": "Column c is null or empty"}, None],
],
EXPECTED_SCHEMA,
)
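    # Rows with a >= 3 or a null keep the col_b result empty even though b is null,
    # because the "a<3" filter excludes them; the unfiltered check on c still flags every null c.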

assert_df_equality(checked, expected, ignore_nullable=True)


def test_apply_checks_from_json_file_by_metadata(ws, spark):
dq_engine = DQEngine(ws)
schema = "col1: int, col2: int, col3: int, col4 int"