feat: add filter to rule #141

Merged: 10 commits, Feb 7, 2025
src/databricks/labs/dqx/engine.py: 4 changes (3 additions, 1 deletion)
@@ -136,20 +136,22 @@ def build_checks_by_metadata(checks: list[dict], glbs: dict[str, Any] | None = N
assert func # should already be validated
func_args = check.get("arguments", {})
criticality = check_def.get("criticality", "error")
filter_expr = check_def.get("filter")

if "col_names" in func_args:
logger.debug(f"Adding DQRuleColSet with columns: {func_args['col_names']}")
dq_rule_checks += DQRuleColSet(
columns=func_args["col_names"],
check_func=func,
criticality=criticality,
filter=filter_expr,
# provide arguments without "col_names"
check_func_kwargs={k: func_args[k] for k in func_args.keys() - {"col_names"}},
).get_rules()
else:
name = check_def.get("name", None)
check_func = func(**func_args)
- dq_rule_checks.append(DQRule(check=check_func, name=name, criticality=criticality))
+ dq_rule_checks.append(DQRule(check=check_func, name=name, criticality=criticality, filter=filter_expr))

logger.debug("Exiting build_checks_by_metadata function with dq_rule_checks")
return dq_rule_checks
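For orientation, a minimal sketch of a metadata-defined check that would hit the new filter path in build_checks_by_metadata. Only the top-level "filter" key is what this PR adds; the nested check/function/arguments layout and the col_name argument are inferred from the surrounding code rather than taken from this diff, so treat them as assumptions.

# Hypothetical metadata check definition; only the "filter" key is new in this PR.
# The check/function/arguments layout is assumed from how build_checks_by_metadata
# reads check_def and check["arguments"] above.
checks = [
    {
        "name": "col_b_is_null_or_empty",
        "criticality": "error",
        "filter": "a < 3",  # SQL expression, picked up via check_def.get("filter")
        "check": {
            "function": "is_not_null_and_not_empty",
            "arguments": {"col_name": "b"},
        },
    },
]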
src/databricks/labs/dqx/rule.py: 8 changes (7 additions, 1 deletion)
@@ -35,6 +35,7 @@ class DQRule:
check: Column
name: str = ""
criticality: str = Criticality.ERROR.value
filter: str | None = None

def __post_init__(self):
# take the name from the alias of the column expression if not provided
@@ -58,7 +59,10 @@ def check_column(self) -> Column:

:return: Column object
"""
- return F.when(self.check.isNull(), F.lit(None).cast("string")).otherwise(self.check)
+ # if filter is provided, apply the filter to the check
+ filter_col = F.expr(self.filter) if self.filter else F.lit(True)
+
+ return F.when(self.check.isNotNull(), F.when(filter_col, self.check)).otherwise(F.lit(None).cast("string"))


@dataclass(frozen=True)
@@ -75,6 +79,7 @@ class DQRuleColSet:
columns: list[str]
check_func: Callable
criticality: str = Criticality.ERROR.value
filter: str | None = None
check_func_args: list[Any] = field(default_factory=list)
check_func_kwargs: dict[str, Any] = field(default_factory=dict)

@@ -88,6 +93,7 @@ def get_rules(self) -> list[DQRule]:
rule = DQRule(
criticality=self.criticality,
check=self.check_func(col_name, *self.check_func_args, **self.check_func_kwargs),
filter=self.filter,
)
rules.append(rule)
return rules
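To make the new check_column behaviour concrete, here is a standalone PySpark sketch of the same when/filter pattern outside the DQRule class. The column names, filter expression, and error message are illustrative only, not part of this PR.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, None), (3, None)], "a int, b string")

# Illustrative check column: a non-null value means the row failed the check.
check = F.when(F.col("b").isNull() | (F.col("b") == ""), F.lit("Column b is null or empty"))

# Mirrors check_column above: the check only fires where the filter holds;
# everywhere else (and where the check itself is null) the result stays null.
filter_col = F.expr("a < 3")
result = F.when(check.isNotNull(), F.when(filter_col, check)).otherwise(F.lit(None).cast("string"))

df.withColumn("col_b_is_null_or_empty", result).show()
# Only the a = 1 row reports the error; the a = 3 row is excluded by the filter.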
tests/integration/test_apply_checks.py: 36 changes (36 additions, 0 deletions)
@@ -380,6 +380,42 @@ def test_apply_checks_by_metadata(ws, spark):
assert_df_equality(checked, expected, ignore_nullable=True)


def test_apply_checks_with_filter(ws, spark):
dq_engine = DQEngine(ws)
test_df = spark.createDataFrame(
[[1, 3, 3], [2, None, 4], [3, 4, None], [4, None, None], [None, None, None]], SCHEMA
)

checks = [
DQRule(
name="col_b_is_null_or_empty",
criticality="error",
check=is_not_null_and_not_empty("b"),
filter="a<3",
),
DQRule(
name="col_c_is_null_or_empty",
criticality="error",
check=is_not_null_and_not_empty("c"),
),
]

checked = dq_engine.apply_checks(test_df, checks)

expected = spark.createDataFrame(
[
[1, 3, 3, None, None],
[2, None, 4, {"col_b_is_null_or_empty": "Column b is null or empty"}, None],
[3, 4, None, {"col_c_is_null_or_empty": "Column c is null or empty"}, None],
[4, None, None, {"col_c_is_null_or_empty": "Column c is null or empty"}, None],
[None, None, None, {"col_c_is_null_or_empty": "Column c is null or empty"}, None],
],
EXPECTED_SCHEMA,
)
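    # Rows with a >= 3 or a null keep the col_b result empty even though b is null,
    # because the "a<3" filter excludes them; the unfiltered check on c still flags every null c.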

assert_df_equality(checked, expected, ignore_nullable=True)


def test_apply_checks_from_json_file_by_metadata(ws, spark):
dq_engine = DQEngine(ws)
schema = "col1: int, col2: int, col3: int, col4 int"