From ff1094228e22a1d40e6bf19552130fd384944892 Mon Sep 17 00:00:00 2001
From: "David H. Irving" <david.irving@noirlab.edu>
Date: Fri, 9 Aug 2024 16:52:58 -0700
Subject: [PATCH] Add expression string syntax for null boolean columns

Add support for `= NULL` and `!= NULL` syntax for boolean columns in query where strings.
---
 .../daf/butler/queries/_expression_strings.py | 38 ++++++++++++++++++-
 .../daf/butler/queries/tree/_predicate.py     |  9 +++--
 .../lsst/daf/butler/tests/butler_queries.py   | 12 ++++++
 3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/python/lsst/daf/butler/queries/_expression_strings.py b/python/lsst/daf/butler/queries/_expression_strings.py
index cc0cc755cf..6c84c95960 100644
--- a/python/lsst/daf/butler/queries/_expression_strings.py
+++ b/python/lsst/daf/butler/queries/_expression_strings.py
@@ -42,6 +42,7 @@
 from .tree import (
     BinaryExpression,
     ColumnExpression,
+    ColumnReference,
     ComparisonOperator,
     LiteralValue,
     Predicate,
@@ -158,6 +159,16 @@ def visitBinaryOp(
                 return Predicate.is_null(rhs.value)
             case ["!=", _Null(), _ColExpr() as rhs]:
                 return Predicate.is_null(rhs.value).logical_not()
+            # Boolean columns can be null, but will have been converted to
+            # Predicate, so we need additional cases.
+            case ["=" | "!=", Predicate() as pred, _Null()] | ["=" | "!=", _Null(), Predicate() as pred]:
+                column_ref = _get_boolean_column_reference(pred)
+                if column_ref is not None:
+                    match operator:
+                        case "=":
+                            return Predicate.is_null(column_ref)
+                        case "!=":
+                            return Predicate.is_null(column_ref).logical_not()
 
             # Handle arithmetic operations
             case [("+" | "-" | "*" | "/" | "%") as op, _ColExpr() as lhs, _ColExpr() as rhs]:
@@ -202,7 +213,16 @@ def visitIdentifier(self, name: str, node: Node) -> _VisitorResult:
         if column_expression.column_type == "bool":
             # Expression-handling code (in this file and elsewhere) expects
             # boolean-valued expressions to be represented as Predicate, not a
-            # column expression.
+            # ColumnExpression.
+
+            # We should only be getting direct references to a column, not a
+            # more complicated expression.
+            # (Anything more complicated should be a Predicate already.)
+            assert (
+                column_expression.expression_type == "dataset_field"
+                or column_expression.expression_type == "dimension_field"
+                or column_expression.expression_type == "dimension_key"
+            )
             return Predicate.from_bool_expression(column_expression)
         else:
             return _ColExpr(column_expression)
@@ -310,3 +330,19 @@ def _convert_in_clause_to_predicate(lhs: ColumnExpression, rhs: _VisitorResult,
             return Predicate.is_null(lhs)
         case _:
             raise InvalidQueryError(f"Invalid IN expression: '{node!s}")
+
+
+def _get_boolean_column_reference(predicate: Predicate) -> ColumnReference | None:
+    """Unwrap a predicate to recover the boolean ColumnReference it contains.
+    Returns `None` if this Predicate contains anything other than a single
+    boolean ColumnReference operand.
+
+    This undoes the ColumnReference to Predicate conversion that occurs in
+    visitIdentifier for boolean columns.
+    """
+    if len(predicate.operands) == 1 and len(predicate.operands[0]) == 1:
+        predicate_leaf = predicate.operands[0][0]
+        if predicate_leaf.predicate_type == "boolean_wrapper":
+            return predicate_leaf.operand
+
+    return None
diff --git a/python/lsst/daf/butler/queries/tree/_predicate.py b/python/lsst/daf/butler/queries/tree/_predicate.py
index 8335c81e3a..15368b491e 100644
--- a/python/lsst/daf/butler/queries/tree/_predicate.py
+++ b/python/lsst/daf/butler/queries/tree/_predicate.py
@@ -46,6 +46,7 @@
 from ._base import QueryTreeBase
 from ._column_expression import (
     ColumnExpression,
+    ColumnReference,
     is_one_datetime_and_one_ingest_date,
     is_one_timespan_and_one_datetime,
 )
@@ -156,9 +157,9 @@ def from_bool(cls, value: bool) -> Predicate:
         return cls.model_construct(operands=() if value else ((),))
 
     @classmethod
-    def from_bool_expression(cls, value: ColumnExpression) -> Predicate:
-        """Construct a predicate that wraps a boolean ColumnExpression, taking
-        on the value of the underlying ColumnExpression.
+    def from_bool_expression(cls, value: ColumnReference) -> Predicate:
+        """Construct a predicate that wraps a boolean ColumnReference, taking
+        on the value of the underlying ColumnReference.
 
         Parameters
         ----------
@@ -437,7 +438,7 @@ class BooleanWrapper(PredicateLeafBase):
 
     predicate_type: Literal["boolean_wrapper"] = "boolean_wrapper"
 
-    operand: ColumnExpression
+    operand: ColumnReference
     """Wrapped expression that will be used as the value for this predicate."""
 
     def gather_required_columns(self, columns: ColumnSet) -> None:
diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py
index 11a7fa987d..9150d5f510 100644
--- a/python/lsst/daf/butler/tests/butler_queries.py
+++ b/python/lsst/daf/butler/tests/butler_queries.py
@@ -988,6 +988,18 @@ def _run_query(where: str) -> list[int]:
                     query_func("exposure.can_see_sky OR exposure = 2001"), [TRUE_ID, FALSE_ID_1]
                 )
 
+        # Find nulls and non-nulls.  This is run only against the new query
+        # system.  It appears that the `= NULL` syntax never had test coverage
+        # in the old query system and doesn't appear to work for any column
+        # types, not just bool.  Not worth fixing since we are dropping that
+        # code soon.
+        nulls = [NULL_ID_1, NULL_ID_2]
+        non_nulls = [TRUE_ID, FALSE_ID_1, FALSE_ID_2]
+        self.assertCountEqual(_run_query("exposure.can_see_sky = NULL"), nulls)
+        self.assertCountEqual(_run_query("exposure.can_see_sky != NULL"), non_nulls)
+        self.assertCountEqual(_run_query("NULL = exposure.can_see_sky"), nulls)
+        self.assertCountEqual(_run_query("NULL != exposure.can_see_sky"), non_nulls)
+
         # Test boolean columns in ExpressionFactory.
         with butler._query() as query:
             x = query.expression_factory