From 539e9297aff5a187696545d808a6f65cd4c6cafe Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Tue, 3 Sep 2024 21:43:07 -0700
Subject: [PATCH] Add some simple query interface tests

These duplicate the advanced query tests using the simple interface
where possible.
---
 python/lsst/daf/butler/_butler.py           |  27 ++-
 .../lsst/daf/butler/tests/butler_queries.py | 204 ++++++++++++++++--
 2 files changed, 211 insertions(+), 20 deletions(-)

diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index 9798b857d6..166e2280ee 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -1549,6 +1549,8 @@ def query_data_ids(
         """
         if data_id is None:
             data_id = DataCoordinate.make_empty(self.dimensions)
+        if order_by is None:
+            order_by = []
         with self.query() as query:
             result = (
                 query.where(data_id, where, bind=bind, **kwargs)
@@ -1573,6 +1575,8 @@ def query_datasets(
         where: str = "",
         bind: Mapping[str, Any] | None = None,
         with_dimension_records: bool = False,
+        order_by: Iterable[str] | str | None = None,
+        limit: int = 20_000,
         explain: bool = True,
         **kwargs: Any,
     ) -> list[DatasetRef]:
@@ -1584,7 +1588,7 @@ def query_datasets(
             Dataset type object or name to search for.
         collections : collection expression, optional
             A collection name or iterable of collection names to search. If not
-            provided, the default collections are used. See
+            provided, the default collections are used. Can be a wildcard. See
             :ref:`daf_butler_collection_expressions` for more information.
         find_first : `bool`, optional
             If `True` (default), for each result data ID, only yield one
@@ -1609,6 +1613,12 @@ def query_datasets(
         with_dimension_records : `bool`, optional
             If `True` (default is `False`) then returned data IDs will have
             dimension records.
+        order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
+            Names of the columns/dimensions to use for ordering returned
+            datasets. A column name can be prefixed with minus (``-``) for
+            descending ordering.
+        limit : `int`, optional
+            Upper limit on the number of returned datasets.
         explain : `bool`, optional
             If `True` (default) then `EmptyQueryResultError` exception is
             raised when resulting list is empty. The exception contains
@@ -1654,11 +1664,16 @@ def query_datasets(
         """
         if data_id is None:
             data_id = DataCoordinate.make_empty(self.dimensions)
+        if order_by is None:
+            order_by = []
+        if collections:
+            collections = self.collections.query(collections)
         with self.query() as query:
-            result = query.where(data_id, where, bind=bind, **kwargs).datasets(
-                dataset_type,
-                collections=collections,
-                find_first=find_first,
+            result = (
+                query.where(data_id, where, bind=bind, **kwargs)
+                .datasets(dataset_type, collections=collections, find_first=find_first)
+                .order_by(*ensure_iterable(order_by))
+                .limit(limit)
             )
             if with_dimension_records:
                 result = result.with_dimension_records()
@@ -1738,6 +1753,8 @@ def query_dimension_records(
         """
         if data_id is None:
             data_id = DataCoordinate.make_empty(self.dimensions)
+        if order_by is None:
+            order_by = []
         with self.query() as query:
             result = (
                 query.where(data_id, where, bind=bind, **kwargs)
diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py
index 8442b84e39..f893e64827 100644
--- a/python/lsst/daf/butler/tests/butler_queries.py
+++ b/python/lsst/daf/butler/tests/butler_queries.py
@@ -44,7 +44,7 @@
 from .._butler import Butler
 from .._collection_type import CollectionType
 from .._dataset_type import DatasetType
-from .._exceptions import InvalidQueryError
+from .._exceptions import EmptyQueryResultError, InvalidQueryError
 from .._timespan import Timespan
 from ..dimensions import DataCoordinate, DimensionGroup, DimensionRecord
 from ..direct_query_driver import DirectQueryDriver
@@ -153,6 +153,14 @@ def check_detector_records(
         self.assertFalse(results.any(exact=False, execute=False))
         self.assertFalse(results.any(exact=True, execute=False))
         self.assertCountEqual(results.explain_no_results(), list(messages))
+        self.check_detector_records_returned(list(results), ids=ids, ordered=ordered)
+
+    def check_detector_records_returned(
+        self,
+        results: list[DimensionRecord],
+        ids: Sequence[int] = (1, 2, 3, 4),
+        ordered: bool = False,
+    ) -> None:
         expected = [DETECTOR_TUPLES[i] for i in ids]
         queried = list(make_detector_tuples(results).values())
         if ordered:
@@ -173,33 +181,64 @@ def test_simple_record_query(self) -> None:
             _x = query.expression_factory
             results = query.dimension_records("detector")
             self.check_detector_records(results)
+            self.check_detector_records_returned(butler.query_dimension_records("detector"))
             self.check_detector_records(results.order_by("detector"), ordered=True)
+            self.check_detector_records_returned(
+                butler.query_dimension_records("detector", order_by="detector"), ordered=True
+            )
             self.check_detector_records(
                 results.order_by(_x.detector.full_name.desc), [4, 3, 2, 1], ordered=True
             )
+            self.check_detector_records_returned(
+                butler.query_dimension_records("detector", order_by="-full_name"),
+                ids=[4, 3, 2, 1],
+                ordered=True,
+            )
             self.check_detector_records(results.order_by("detector").limit(2), [1, 2], ordered=True)
+            self.check_detector_records_returned(
+                butler.query_dimension_records("detector", limit=2, order_by="detector"),
+                ids=[1, 2],
+                ordered=True,
+            )
             self.check_detector_records(results.where(_x.detector.raft == "B", instrument="Cam1"), [3, 4])
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector", where="detector.raft = R", bind={"R": "B"}, instrument="Cam1"
+                ),
+                ids=[3, 4],
+            )

     def test_simple_data_coordinate_query(self) -> None:
         butler = self.make_butler("base.yaml")
+
+        expected_detectors = [1, 2, 3, 4]
+        universe = butler.dimensions
+        expected_coordinates = [
+            DataCoordinate.standardize({"instrument": "Cam1", "detector": x}, universe=universe)
+            for x in expected_detectors
+        ]
+
         with butler.query() as query:
             # Test empty query
-            self.assertCountEqual(query.data_ids([]), [DataCoordinate.makeEmpty(butler.dimensions)])
+            empty = DataCoordinate.make_empty(butler.dimensions)
+            self.assertCountEqual(list(query.data_ids([])), [empty])
+            self.assertCountEqual(butler.query_data_ids([]), [empty])

             # Test query for a single dimension
             results = query.data_ids(["detector"])
-            expected_detectors = [1, 2, 3, 4]
-            universe = butler.dimensions
-            expected_coordinates = [
-                DataCoordinate.standardize({"instrument": "Cam1", "detector": x}, universe=universe)
-                for x in expected_detectors
-            ]
             self.assertCountEqual(list(results), expected_coordinates)
+            data_ids = butler.query_data_ids("detector")
+            self.assertCountEqual(data_ids, expected_coordinates)
+
     def test_simple_dataset_query(self) -> None:
         butler = self.make_butler("base.yaml", "datasets.yaml")
         with butler.query() as query:
-            refs = list(query.datasets("bias", "imported_g").order_by("detector"))
+            refs_q = list(query.datasets("bias", "imported_g").order_by("detector"))
+            refs_simple = butler.query_datasets("bias", "imported_g", order_by="detector")
+            self.assertCountEqual(refs_q, refs_simple)
+
+            for refs in (refs_q, refs_simple):
                 self.assertEqual(len(refs), 3)
                 self.assertEqual(refs[0].id, UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22"))
                 self.assertEqual(refs[1].id, UUID("51352db4-a47a-447c-b12d-a50b206b17cd"))
@@ -209,6 +248,19 @@ def test_simple_dataset_query(self) -> None:
                 self.assertEqual(ref.dataId["detector"], detector)
                 self.assertEqual(ref.run, "imported_g")

+        # Try again with limit.
+        with butler.query() as query:
+            refs_q = list(query.datasets("bias", "imported_g").order_by("detector").limit(2))
+            refs_simple = butler.query_datasets("bias", "imported_g", order_by="detector", limit=2)
+            self.assertCountEqual(refs_q, refs_simple)
+            self.assertEqual(len(refs_q), 2)
+            self.assertEqual(refs_q[0].id, UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22"))
+            self.assertEqual(refs_q[1].id, UUID("51352db4-a47a-447c-b12d-a50b206b17cd"))
+
+        with self.assertRaises(EmptyQueryResultError) as cm:
+            butler.query_datasets("bias", "*", detector=100, instrument="Unknown")
+        self.assertIn("doomed", str(cm.exception))
+
     def test_general_query(self) -> None:
         """Test Query.general and its result."""
         butler = self.make_butler("base.yaml", "datasets.yaml")
@@ -425,6 +477,14 @@ def test_implied_union_record_query(self) -> None:
                 list(query.where(physical_filter="Cam1-R1", instrument="Cam1").dimension_records("band")),
                 [band.RecordClass(name="r")],
             )
+            self.assertCountEqual(
+                butler.query_dimension_records("band"),
+                [band.RecordClass(name="g"), band.RecordClass(name="r")],
+            )
+            self.assertCountEqual(
+                butler.query_dimension_records("band", physical_filter="Cam1-R1", instrument="Cam1"),
+                [band.RecordClass(name="r")],
+            )

     def test_dataset_constrained_record_query(self) -> None:
         """Test a query for dimension records constrained by the existence of
@@ -531,6 +591,15 @@ def test_spatial_overlaps(self) -> None:
                 [1, 3, 4],
                 has_postprocessing=True,
             )
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector",
+                    where="visit_detector_region.region OVERLAPS region",
+                    bind={"region": htm7.pixelization.pixel(253954)},
+                    visit=1,
+                ),
+                ids=[1, 3, 4],
+            )
             # Query for detectors from a particular visit that overlap an htm7
             # ID. This is basically the same query as the last one, but
             # expressed as a spatial join, and we can recognize that
@@ -556,6 +625,14 @@ def test_spatial_overlaps(self) -> None:
                 [1, 3, 4],
                 has_postprocessing=False,
             )
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector",
+                    visit=1,
+                    htm7=253954,
+                ),
+                ids=[1, 3, 4],
+            )
             # Query for the detectors from any visit that overlap a region:
             # this gets contributions from multiple visits, and would have
             # duplicates if we didn't get rid of them via GROUP BY.
@@ -566,6 +643,14 @@ def test_spatial_overlaps(self) -> None:
                 [1, 2, 3, 4],
                 has_postprocessing=True,
             )
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector",
+                    where="visit_detector_region.region OVERLAPS region",
+                    bind={"region": htm7.pixelization.pixel(253954)},
+                ),
+                ids=[1, 2, 3, 4],
+            )
             # Once again we rewrite the region-constraint query as a spatial
             # join, which drops the postprocessing. This join has to be
             # explicit because `visit` no longer gets into the query dimensions
@@ -604,6 +689,14 @@ def test_spatial_overlaps(self) -> None:
                 [1, 2, 3],
                 has_postprocessing=True,
             )
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector",
+                    where="visit_detector_region.region OVERLAPS region",
+                    bind={"region": patch_record.region},
+                ),
+                ids=[1, 2, 3],
+            )
             # Combine postprocessing with order_by and limit.
             self.check_detector_records(
                 query.where(
@@ -615,6 +708,16 @@ def test_spatial_overlaps(self) -> None:
                 [3, 2],
                 has_postprocessing=True,
            )
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector",
+                    where="visit_detector_region.region OVERLAPS region",
+                    bind={"region": patch_record.region},
+                    order_by="-detector",
+                    limit=2,
+                ),
+                ids=[3, 2],
+            )
             # Try a case where there are some records before postprocessing but
             # none afterwards.
             self.check_detector_records(
@@ -625,7 +728,16 @@ def test_spatial_overlaps(self) -> None:
                 [],
                 has_postprocessing=True,
             )
-
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector",
+                    where="visit_detector_region.region OVERLAPS region",
+                    bind={"region": patch_record.region},
+                    detector=4,
+                    explain=False,
+                ),
+                ids=[],
+            )
             # Check spatial queries using points instead of regions.
             # This (ra, dec) is a point in the center of the region for visit
             # 1, detector 3.
@@ -744,6 +856,13 @@ def test_common_skypix_overlaps(self) -> None:
                 ],
                 [253954, 253955],
             )
+            self.assertCountEqual(
+                [
+                    record.id
+                    for record in butler.query_dimension_records("htm7", skymap="SkyMap1", tract=0, patch=4)
+                ],
+                [253954, 253955],
+            )
             # Constraint on the patch region (with the query not knowing it
             # corresponds to that patch).
             (patch,) = query.where(skymap="SkyMap1", tract=0, patch=4).dimension_records("patch")
@@ -774,6 +893,13 @@ def test_spatial_constraint_queries(self) -> None:
                     for data_id in query.data_ids(["patch"]).where({"instrument": "HSC", "visit": 318})
                 ],
             )
+            self.assertEqual(
+                [(9813, 72)],
+                [
+                    (data_id["tract"], data_id["patch"])
+                    for data_id in butler.query_data_ids(["patch"], instrument="HSC", visit=318)
+                ],
+            )
             # This tests the case where the 'patch' region is needed in
             # postprocessing AND is also returned in the result rows.
@@ -789,6 +915,13 @@ def test_spatial_constraint_queries(self) -> None:
                     for record in query.dimension_records("patch").where({"instrument": "HSC", "visit": 318})
                 ],
             )
+            self.assertEqual(
+                [(9813, 72, region_hex)],
+                [
+                    (record.tract, record.id, record.region.encode().hex())
+                    for record in butler.query_dimension_records("patch", instrument="HSC", visit=318)
+                ],
+            )

     def test_data_coordinate_upload(self) -> None:
         """Test queries for dimension records with a data coordinate upload."""
@@ -1005,11 +1138,11 @@ def test_timespan_results(self) -> None:
         """Test returning dimension records that include timespans."""
         butler = self.make_butler("base.yaml", "spatial.yaml")
         with butler.query() as query:
+            query_results = list(query.dimension_records("visit"))
+            simple_results = butler.query_dimension_records("visit")
+            for results in (query_results, simple_results):
                 self.assertCountEqual(
-                    [
-                        (record.id, record.timespan.begin, record.timespan.end)
-                        for record in query.dimension_records("visit")
-                    ],
+                    [(record.id, record.timespan.begin, record.timespan.end) for record in results],
                     [
                         (
                             1,
@@ -1060,6 +1193,10 @@ def test_column_expressions(self) -> None:
                 query.where(_x.not_(_x.detector != 2)).dimension_records("detector"),
                 [2],
             )
+            self.check_detector_records_returned(
+                butler.query_dimension_records("detector", where="NOT (detector != 2)"),
+                [2],
+            )
             self.check_detector_records(
                 # Empty string expression should evaluate to True.
                 query.where(_x.detector == 2, "").dimension_records("detector"),
@@ -1121,6 +1258,18 @@ def test_column_expressions(self) -> None:
                 ],
                 [2],
             )
+            self.assertCountEqual(
+                [
+                    record.id
+                    for record in butler.query_dimension_records(
+                        # In the middle of the timespan.
+                        "visit",
+                        where="visit.timespan OVERLAPS(ts)",
+                        bind={"ts": astropy.time.Time("2021-09-09T03:02:30", format="isot", scale="tai")},
+                    )
+                ],
+                [2],
+            )
             self.assertCountEqual(
                 [
                     record.id
@@ -1199,6 +1348,12 @@ def test_column_expressions(self) -> None:
                 query.where(_x.detector.in_iterable([1, 3, 4])).dimension_records("detector"),
                 [1, 3, 4],
             )
+            self.check_detector_records_returned(
+                butler.query_dimension_records(
+                    "detector", where="detector IN (det)", bind={"det": [1, 3, 4]}
+                ),
+                [1, 3, 4],
+            )
             self.check_detector_records(
                 query.where(_x.detector.in_range(start=2, stop=None)).dimension_records("detector"),
                 [2, 3, 4],
@@ -1259,6 +1414,11 @@ def _run_registry_query(where: str) -> list[int]:
                 butler.registry.queryDimensionRecords("exposure", where=where, instrument="HSC")
             )

+        def _run_simple_query(where: str) -> list[int]:
+            return _get_exposure_ids_from_dimension_records(
+                butler.query_dimension_records("exposure", where=where, instrument="HSC")
+            )
+
         def _run_query(where: str) -> list[int]:
             with butler.query() as query:
                 return _get_exposure_ids_from_dimension_records(
@@ -1266,7 +1426,11 @@ def _run_query(where: str) -> list[int]:
                 )

         # Test boolean columns in the `where` string syntax.
-        for test, query_func in [("registry", _run_registry_query), ("new-query", _run_query)]:
+        for test, query_func in [
+            ("registry", _run_registry_query),
+            ("new-query", _run_query),
+            ("simple", _run_simple_query),
+        ]:
             with self.subTest(test):
                 # Boolean columns should be usable standalone as an expression.
                self.assertCountEqual(query_func("exposure.can_see_sky"), [TRUE_ID])
@@ -1392,6 +1556,16 @@ def test_dataset_region_queries(self) -> None:
                 refs = list(results)
                 self.assertEqual(len(refs), count, f"POS={pos} REFS={refs}")

+            simple_refs = butler.query_datasets(
+                "calexp",
+                collections=run,
+                instrument="HSC",
+                where="visit_detector_region.region OVERLAPS(POS)",
+                bind={"POS": Region.from_ivoa_pos(pos)},
+                explain=False,
+            )
+            self.assertCountEqual(refs, simple_refs)
+
     def test_dataset_time_queries(self) -> None:
         """Test region queries for datasets."""
         # Import data to play with.
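
A minimal usage sketch of the simple interface exercised above, for
reference. The repository path ("my_repo") is a placeholder, and
Butler.from_config / EmptyQueryResultError are assumed to be importable
from the package top level; the collection ("imported_g") and dataset
type ("bias") names match the imported test data.

    from lsst.daf.butler import Butler, EmptyQueryResultError

    butler = Butler.from_config("my_repo")  # placeholder repository path

    # Datasets ordered by detector, capped at two results.
    refs = butler.query_datasets("bias", "imported_g", order_by="detector", limit=2)

    # A leading "-" on a column name requests descending order.
    records = butler.query_dimension_records("detector", order_by="-full_name")

    # Data IDs for a dimension, as plain DataCoordinate objects.
    data_ids = butler.query_data_ids(["detector"])

    # With explain=True (the default) an empty result raises
    # EmptyQueryResultError instead of returning an empty list;
    # pass explain=False to get [] back.
    try:
        butler.query_datasets("bias", "*", detector=100, instrument="Unknown")
    except EmptyQueryResultError as err:
        print(err)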