
Update filter documentation for expressions #49309

Open · wants to merge 12 commits into master
15 changes: 13 additions & 2 deletions python/ray/data/dataset.py
@@ -1180,6 +1180,11 @@ def filter(
stateful Ray actors. For more information, see
:ref:`Stateful Transforms <stateful_transforms>`.

.. tip::
If you can represent your filter as an expression that uses the Arrow
Dataset Expression API, Ray Data performs highly optimized filtering
through native Arrow interfaces.
Member:

Let's remove the two tips below?

If you can represent your predicate with NumPy or pandas operations,
:meth:`Dataset.map_batches` might be faster. You can implement filter by
dropping rows.

If you're reading parquet files with :meth:`ray.data.read_parquet`,
and the filter is a simple predicate, you might
be able to speed it up by using filter pushdown; see
:ref:`Parquet row pruning <parquet_row_pruning>` for details.
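
For context on the two tips under discussion, here is a minimal sketch of both patterns. The `"example.parquet"` path is a placeholder, and the `filter=` argument to `read_parquet` is assumed based on the linked row-pruning guide rather than taken from this diff.

```python
# Sketch of the two patterns from the quoted tips. "example.parquet" is a
# placeholder path; filter= follows the Parquet row pruning guide.
import pyarrow.dataset as pads
import ray

# 1. Implement a filter by dropping rows in a vectorized map_batches call:
ds = ray.data.range(100)
evens = ds.map_batches(
    lambda batch: batch[batch["id"] % 2 == 0],
    batch_format="pandas",
)

# 2. Push a simple predicate down into the Parquet read, so pruned rows are
#    never materialized:
pruned = ray.data.read_parquet("example.parquet", filter=pads.field("id") <= 4)
```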

Member:

And for "Parquet row pruning", let's remove the corresponding section from the performance tips user guide?

Contributor Author:

That seemed useful to me when I saw the tip and it's valid even now. Let me remove it and upload here. Will help with discussion if it's still valid.

.. tip::
If you can represent your predicate with NumPy or pandas operations,
:meth:`Dataset.map_batches` might be faster. You can implement filter by
@@ -1197,14 +1202,16 @@ def filter(
>>> ds = ray.data.range(100)
>>> ds.filter(lambda row: row["id"] % 2 == 0).take_all()
[{'id': 0}, {'id': 2}, {'id': 4}, ...]
Member:

Maybe remove this since we don't want people using the fn parameter?

Contributor Author:

Given that expr is limited, I thought we could retain it. Let me remove this one.

>>> ds.filter(expr="id <= 4").take_all()
[{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}]

Time complexity: O(dataset size / parallelism)
Contributor:

I think it's worth showing both the UDF-based and expr-based examples and clearly calling out that the expr-based one has clear performance advantages (skipping deserialization, etc.).
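
A hedged sketch of the side-by-side example being requested, reusing the values already shown in this diff:

```python
# Sketch of the requested comparison; both calls reuse the docstring's example data.
import ray

ds = ray.data.range(100)

# UDF-based filter: the predicate is a Python callable applied row by row.
ds.filter(lambda row: row["id"] <= 4).take_all()

# Expression-based filter: Arrow evaluates the predicate natively, skipping
# per-row deserialization into Python, which is where the speedup comes from.
ds.filter(expr="id <= 4").take_all()

# Both return [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}, {'id': 4}].
```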


Args:
fn: The predicate to apply to each row, or a class type
that can be instantiated to create such a callable.
expr: An expression string that will be
converted to pyarrow.dataset.Expression type.
expr: An expression string that must be a valid Python expression; it
will be converted to the pyarrow.dataset.Expression type.
compute: This argument is deprecated. Use ``concurrency`` argument.
concurrency: The number of Ray workers to use concurrently. For a
fixed-sized worker pool of size ``n``, specify ``concurrency=n``.
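
A short usage sketch tying the documented arguments together. The callable class and the specific `concurrency` value are illustrative assumptions for this sketch, not part of the diff; the expr-plus-concurrency combination is inferred from the concurrency handling shown below.

```python
# Illustrative usage of the documented arguments; KeepEven and concurrency=2
# are assumptions made for the sketch.
import ray

ds = ray.data.range(100)

# Expression-based predicate with a fixed-size worker pool of 2:
ds.filter(expr="id <= 4", concurrency=2).take_all()


# fn given as a class type that is instantiated to create the predicate:
class KeepEven:
    def __call__(self, row):
        return row["id"] % 2 == 0


ds.filter(KeepEven, concurrency=2).take_all()
```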
@@ -1237,6 +1244,10 @@ def filter(

compute = TaskPoolStrategy(size=concurrency)
else:
warnings.warn(
"Use expr instead of fn when possible for performant filters."
)

if callable(fn):
compute = get_compute_strategy(
fn=fn,