From ff1094228e22a1d40e6bf19552130fd384944892 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Fri, 9 Aug 2024 16:52:58 -0700 Subject: [PATCH] Add expression string syntax for null boolean columns Add support for `= NULL` and `!= NULL` syntax for boolean columns in query where strings. --- .../daf/butler/queries/_expression_strings.py | 38 ++++++++++++++++++- .../daf/butler/queries/tree/_predicate.py | 9 +++-- .../lsst/daf/butler/tests/butler_queries.py | 12 ++++++ 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/python/lsst/daf/butler/queries/_expression_strings.py b/python/lsst/daf/butler/queries/_expression_strings.py index cc0cc755cf..6c84c95960 100644 --- a/python/lsst/daf/butler/queries/_expression_strings.py +++ b/python/lsst/daf/butler/queries/_expression_strings.py @@ -42,6 +42,7 @@ from .tree import ( BinaryExpression, ColumnExpression, + ColumnReference, ComparisonOperator, LiteralValue, Predicate, @@ -158,6 +159,16 @@ def visitBinaryOp( return Predicate.is_null(rhs.value) case ["!=", _Null(), _ColExpr() as rhs]: return Predicate.is_null(rhs.value).logical_not() + # Boolean columns can be null, but will have been converted to + # Predicate, so we need additional cases. + case ["=" | "!=", Predicate() as pred, _Null()] | ["=" | "!=", _Null(), Predicate() as pred]: + column_ref = _get_boolean_column_reference(pred) + if column_ref is not None: + match operator: + case "=": + return Predicate.is_null(column_ref) + case "!=": + return Predicate.is_null(column_ref).logical_not() # Handle arithmetic operations case [("+" | "-" | "*" | "/" | "%") as op, _ColExpr() as lhs, _ColExpr() as rhs]: @@ -202,7 +213,16 @@ def visitIdentifier(self, name: str, node: Node) -> _VisitorResult: if column_expression.column_type == "bool": # Expression-handling code (in this file and elsewhere) expects # boolean-valued expressions to be represented as Predicate, not a - # column expression. + # ColumnExpression. + + # We should only be getting direct references to a column, not a + # more complicated expression. + # (Anything more complicated should be a Predicate already.) + assert ( + column_expression.expression_type == "dataset_field" + or column_expression.expression_type == "dimension_field" + or column_expression.expression_type == "dimension_key" + ) return Predicate.from_bool_expression(column_expression) else: return _ColExpr(column_expression) @@ -310,3 +330,19 @@ def _convert_in_clause_to_predicate(lhs: ColumnExpression, rhs: _VisitorResult, return Predicate.is_null(lhs) case _: raise InvalidQueryError(f"Invalid IN expression: '{node!s}") + + +def _get_boolean_column_reference(predicate: Predicate) -> ColumnReference | None: + """Unwrap a predicate to recover the boolean ColumnReference it contains. + Returns `None` if this Predicate contains anything other than a single + boolean ColumnReference operand. + + This undoes the ColumnReference to Predicate conversion that occurs in + visitIdentifier for boolean columns. + """ + if len(predicate.operands) == 1 and len(predicate.operands[0]) == 1: + predicate_leaf = predicate.operands[0][0] + if predicate_leaf.predicate_type == "boolean_wrapper": + return predicate_leaf.operand + + return None diff --git a/python/lsst/daf/butler/queries/tree/_predicate.py b/python/lsst/daf/butler/queries/tree/_predicate.py index 8335c81e3a..15368b491e 100644 --- a/python/lsst/daf/butler/queries/tree/_predicate.py +++ b/python/lsst/daf/butler/queries/tree/_predicate.py @@ -46,6 +46,7 @@ from ._base import QueryTreeBase from ._column_expression import ( ColumnExpression, + ColumnReference, is_one_datetime_and_one_ingest_date, is_one_timespan_and_one_datetime, ) @@ -156,9 +157,9 @@ def from_bool(cls, value: bool) -> Predicate: return cls.model_construct(operands=() if value else ((),)) @classmethod - def from_bool_expression(cls, value: ColumnExpression) -> Predicate: - """Construct a predicate that wraps a boolean ColumnExpression, taking - on the value of the underlying ColumnExpression. + def from_bool_expression(cls, value: ColumnReference) -> Predicate: + """Construct a predicate that wraps a boolean ColumnReference, taking + on the value of the underlying ColumnReference. Parameters ---------- @@ -437,7 +438,7 @@ class BooleanWrapper(PredicateLeafBase): predicate_type: Literal["boolean_wrapper"] = "boolean_wrapper" - operand: ColumnExpression + operand: ColumnReference """Wrapped expression that will be used as the value for this predicate.""" def gather_required_columns(self, columns: ColumnSet) -> None: diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py index 11a7fa987d..9150d5f510 100644 --- a/python/lsst/daf/butler/tests/butler_queries.py +++ b/python/lsst/daf/butler/tests/butler_queries.py @@ -988,6 +988,18 @@ def _run_query(where: str) -> list[int]: query_func("exposure.can_see_sky OR exposure = 2001"), [TRUE_ID, FALSE_ID_1] ) + # Find nulls and non-nulls. This is run only against the new query + # system. It appears that the `= NULL` syntax never had test coverage + # in the old query system and doesn't appear to work for any column + # types, not just bool. Not worth fixing since we are dropping that + # code soon. + nulls = [NULL_ID_1, NULL_ID_2] + non_nulls = [TRUE_ID, FALSE_ID_1, FALSE_ID_2] + self.assertCountEqual(_run_query("exposure.can_see_sky = NULL"), nulls) + self.assertCountEqual(_run_query("exposure.can_see_sky != NULL"), non_nulls) + self.assertCountEqual(_run_query("NULL = exposure.can_see_sky"), nulls) + self.assertCountEqual(_run_query("NULL != exposure.can_see_sky"), non_nulls) + # Test boolean columns in ExpressionFactory. with butler._query() as query: x = query.expression_factory