feat(python): Issue warning when using to_struct() without a list of field names (#20158)
pola-rs · Dec 5, 2024 · 588a22b · 588a22b
1 parent 300340a
commit 588a22b
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 26 deletions.
diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
 import copy
+import warnings
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any, Callable
 
 import polars._reexport as pl
 from polars import functions as F
 from polars._utils.parse import parse_into_expression
+from polars._utils.various import find_stacklevel
 from polars._utils.wrap import wrap_expr
 
 if TYPE_CHECKING:
@@ -1095,6 +1097,8 @@ def to_struct(
         n_field_strategy: ListToStructWidthStrategy = "first_non_null",
         fields: Sequence[str] | Callable[[int], str] | None = None,
         upper_bound: int = 0,
+        *,
+        _eager: bool = False,
     ) -> Expr:
         """
         Convert the Series of type `List` to a Series of type `Struct`.
@@ -1140,9 +1144,7 @@ def to_struct(
         Convert list to struct with default field name assignment:
 
         >>> df = pl.DataFrame({"n": [[0, 1], [0, 1, 2]]})
-        >>> df.with_columns(
-        ...     struct=pl.col("n").list.to_struct()
-        ... )  # doctest: +IGNORE_RESULT
+        >>> df.with_columns(struct=pl.col("n").list.to_struct())  # doctest: +SKIP
         shape: (2, 2)
         ┌───────────┬───────────┐
         │ n         ┆ struct    │
@@ -1158,7 +1160,7 @@ def to_struct(
 
         >>> df.with_columns(
         ...     struct=pl.col("n").list.to_struct(n_field_strategy="max_width")
-        ... )  # doctest: +IGNORE_RESULT
+        ... )  # doctest: +SKIP
         shape: (2, 2)
         ┌───────────┬────────────┐
         │ n         ┆ struct     │
@@ -1174,7 +1176,7 @@ def to_struct(
         >>> df = pl.DataFrame({"n": [[0, 1], [2, 3]]})
         >>> df.select(pl.col("n").list.to_struct(fields=lambda idx: f"n{idx}")).rows(
         ...     named=True
-        ... )
+        ... )  # doctest: +SKIP
         [{'n': {'n0': 0, 'n1': 1}}, {'n': {'n0': 2, 'n1': 3}}]
 
         Convert list to struct with field name assignment by index from a list of names:
@@ -1188,6 +1190,13 @@ def to_struct(
             pyexpr = self._pyexpr.list_to_struct_fixed_width(fields)
             return wrap_expr(pyexpr)
         else:
+            if not _eager:
+                msg = (
+                    "`to_struct()` should be passed a list of field names to avoid "
+                    "query errors in subsequent operations (e.g. <struct operation> "
+                    "not supported for dtype Unknown)"
+                )
+                warnings.warn(msg, stacklevel=find_stacklevel())
             pyexpr = self._pyexpr.list_to_struct(n_field_strategy, fields, upper_bound)
             return wrap_expr(pyexpr)
 

diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py
@@ -920,6 +920,7 @@ def to_struct(
                     n_field_strategy,
                     fields,
                     upper_bound=0,
+                    _eager=True,
                 )
             )
             .to_series()

diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py
@@ -264,32 +264,30 @@ def test_from_dicts_struct() -> None:
 @pytest.mark.may_fail_auto_streaming
 def test_list_to_struct() -> None:
     df = pl.DataFrame({"a": [[1, 2, 3], [1, 2]]})
-    assert df.select([pl.col("a").list.to_struct()]).to_series().to_list() == [
+    assert df.to_series().list.to_struct().to_list() == [
         {"field_0": 1, "field_1": 2, "field_2": 3},
         {"field_0": 1, "field_1": 2, "field_2": None},
     ]
 
     df = pl.DataFrame({"a": [[1, 2], [1, 2, 3]]})
-    assert df.select(
-        pl.col("a").list.to_struct(fields=lambda idx: f"col_name_{idx}")
-    ).to_series().to_list() == [
+    assert df.to_series().list.to_struct(
+        fields=lambda idx: f"col_name_{idx}"
+    ).to_list() == [
         {"col_name_0": 1, "col_name_1": 2},
         {"col_name_0": 1, "col_name_1": 2},
     ]
 
     df = pl.DataFrame({"a": [[1, 2], [1, 2, 3]]})
-    assert df.select(
-        pl.col("a").list.to_struct(n_field_strategy="max_width")
-    ).to_series().to_list() == [
+    assert df.to_series().list.to_struct(n_field_strategy="max_width").to_list() == [
         {"field_0": 1, "field_1": 2, "field_2": None},
         {"field_0": 1, "field_1": 2, "field_2": 3},
     ]
 
     # set upper bound
     df = pl.DataFrame({"lists": [[1, 1, 1], [0, 1, 0], [1, 0, 0]]})
-    assert df.lazy().select(pl.col("lists").list.to_struct(upper_bound=3)).unnest(
-        "lists"
-    ).sum().collect().columns == ["field_0", "field_1", "field_2"]
+    assert df.lazy().select(
+        pl.col("lists").list.to_struct(upper_bound=3, _eager=True)
+    ).unnest("lists").sum().collect().columns == ["field_0", "field_1", "field_2"]
 
 
 def test_sort_df_with_list_struct() -> None:
@@ -1145,7 +1143,7 @@ def test_list_to_struct_19208() -> None:
         }
     )
     assert pl.concat([df[0], df[1], df[2]]).select(
-        pl.col("nested").list.to_struct()
+        pl.col("nested").list.to_struct(_eager=True)
     ).to_dict(as_series=False) == {
         "nested": [{"field_0": {"a": 1}}, {"field_0": None}, {"field_0": {"a": 3}}]
     }

diff --git a/py-polars/tests/unit/operations/namespaces/list/test_list.py b/py-polars/tests/unit/operations/namespaces/list/test_list.py
@@ -638,14 +638,14 @@ def test_list_unique2() -> None:
 def test_list_to_struct() -> None:
     df = pl.DataFrame({"n": [[0, 1, 2], [0, 1]]})
 
-    assert df.select(pl.col("n").list.to_struct()).rows(named=True) == [
+    assert df.select(pl.col("n").list.to_struct(_eager=True)).rows(named=True) == [
         {"n": {"field_0": 0, "field_1": 1, "field_2": 2}},
         {"n": {"field_0": 0, "field_1": 1, "field_2": None}},
     ]
 
-    assert df.select(pl.col("n").list.to_struct(fields=lambda idx: f"n{idx}")).rows(
-        named=True
-    ) == [
+    assert df.select(
+        pl.col("n").list.to_struct(fields=lambda idx: f"n{idx}", _eager=True)
+    ).rows(named=True) == [
         {"n": {"n0": 0, "n1": 1, "n2": 2}},
         {"n": {"n0": 0, "n1": 1, "n2": None}},
     ]
@@ -668,14 +668,16 @@ def test_list_to_struct() -> None:
     #   retrieve the lazy schema
     # * The upper bound is respected during execution
     q = df.lazy().select(
-        pl.col("n").list.to_struct(fields=str, upper_bound=2).struct.unnest()
+        pl.col("n")
+        .list.to_struct(fields=str, upper_bound=2, _eager=True)
+        .struct.unnest()
     )
     assert q.collect_schema() == {"0": pl.Int64, "1": pl.Int64}
     assert_frame_equal(q.collect(), pl.DataFrame({"0": [0, 0], "1": [1, 1]}))
 
-    assert df.lazy().select(pl.col("n").list.to_struct()).collect_schema() == {
-        "n": pl.Unknown
-    }
+    assert df.lazy().select(
+        pl.col("n").list.to_struct(_eager=True)
+    ).collect_schema() == {"n": pl.Unknown}
 
 
 def test_select_from_list_to_struct_11143() -> None:

diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py
@@ -696,7 +696,7 @@ def test_no_panic_pandas_nat() -> None:
 
 def test_list_to_struct_invalid_type() -> None:
     with pytest.raises(pl.exceptions.InvalidOperationError):
-        pl.DataFrame({"a": 1}).select(pl.col("a").list.to_struct())
+        pl.DataFrame({"a": 1}).to_series().list.to_struct()
 
 
 def test_raise_invalid_agg() -> None:

diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py
@@ -139,7 +139,9 @@ def test_unnest_columns_available() -> None:
     q = df.with_columns(
         pl.col("genres")
         .str.split("|")
-        .list.to_struct(n_field_strategy="max_width", fields=lambda i: f"genre{i + 1}")
+        .list.to_struct(
+            n_field_strategy="max_width", fields=lambda i: f"genre{i + 1}", _eager=True
+        )
     ).unnest("genres")
 
     out = q.collect()