Skip to content

Commit

Permalink
feat(python): Issue warning when using to_struct() without a list of field names (#20158)
Browse files Browse the repository at this point in the history
  • Loading branch information
nameexhaustion authored Dec 5, 2024
1 parent 300340a commit 588a22b
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 26 deletions.
19 changes: 14 additions & 5 deletions py-polars/polars/expr/list.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import annotations

import copy
import warnings
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, Callable

import polars._reexport as pl
from polars import functions as F
from polars._utils.parse import parse_into_expression
from polars._utils.various import find_stacklevel
from polars._utils.wrap import wrap_expr

if TYPE_CHECKING:
Expand Down Expand Up @@ -1095,6 +1097,8 @@ def to_struct(
n_field_strategy: ListToStructWidthStrategy = "first_non_null",
fields: Sequence[str] | Callable[[int], str] | None = None,
upper_bound: int = 0,
*,
_eager: bool = False,
) -> Expr:
"""
Convert the Series of type `List` to a Series of type `Struct`.
Expand Down Expand Up @@ -1140,9 +1144,7 @@ def to_struct(
Convert list to struct with default field name assignment:
>>> df = pl.DataFrame({"n": [[0, 1], [0, 1, 2]]})
>>> df.with_columns(
... struct=pl.col("n").list.to_struct()
... ) # doctest: +IGNORE_RESULT
>>> df.with_columns(struct=pl.col("n").list.to_struct()) # doctest: +SKIP
shape: (2, 2)
┌───────────┬───────────┐
│ n ┆ struct │
Expand All @@ -1158,7 +1160,7 @@ def to_struct(
>>> df.with_columns(
... struct=pl.col("n").list.to_struct(n_field_strategy="max_width")
... ) # doctest: +IGNORE_RESULT
... ) # doctest: +SKIP
shape: (2, 2)
┌───────────┬────────────┐
│ n ┆ struct │
Expand All @@ -1174,7 +1176,7 @@ def to_struct(
>>> df = pl.DataFrame({"n": [[0, 1], [2, 3]]})
>>> df.select(pl.col("n").list.to_struct(fields=lambda idx: f"n{idx}")).rows(
... named=True
... )
... ) # doctest: +SKIP
[{'n': {'n0': 0, 'n1': 1}}, {'n': {'n0': 2, 'n1': 3}}]
Convert list to struct with field name assignment by index from a list of names:
Expand All @@ -1188,6 +1190,13 @@ def to_struct(
pyexpr = self._pyexpr.list_to_struct_fixed_width(fields)
return wrap_expr(pyexpr)
else:
if not _eager:
msg = (
"`to_struct()` should be passed a list of field names to avoid "
"query errors in subsequent operations (e.g. <struct operation> "
"not supported for dtype Unknown)"
)
warnings.warn(msg, stacklevel=find_stacklevel())
pyexpr = self._pyexpr.list_to_struct(n_field_strategy, fields, upper_bound)
return wrap_expr(pyexpr)

Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/series/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,7 @@ def to_struct(
n_field_strategy,
fields,
upper_bound=0,
_eager=True,
)
)
.to_series()
Expand Down
20 changes: 9 additions & 11 deletions py-polars/tests/unit/datatypes/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,32 +264,30 @@ def test_from_dicts_struct() -> None:
@pytest.mark.may_fail_auto_streaming
def test_list_to_struct() -> None:
df = pl.DataFrame({"a": [[1, 2, 3], [1, 2]]})
assert df.select([pl.col("a").list.to_struct()]).to_series().to_list() == [
assert df.to_series().list.to_struct().to_list() == [
{"field_0": 1, "field_1": 2, "field_2": 3},
{"field_0": 1, "field_1": 2, "field_2": None},
]

df = pl.DataFrame({"a": [[1, 2], [1, 2, 3]]})
assert df.select(
pl.col("a").list.to_struct(fields=lambda idx: f"col_name_{idx}")
).to_series().to_list() == [
assert df.to_series().list.to_struct(
fields=lambda idx: f"col_name_{idx}"
).to_list() == [
{"col_name_0": 1, "col_name_1": 2},
{"col_name_0": 1, "col_name_1": 2},
]

df = pl.DataFrame({"a": [[1, 2], [1, 2, 3]]})
assert df.select(
pl.col("a").list.to_struct(n_field_strategy="max_width")
).to_series().to_list() == [
assert df.to_series().list.to_struct(n_field_strategy="max_width").to_list() == [
{"field_0": 1, "field_1": 2, "field_2": None},
{"field_0": 1, "field_1": 2, "field_2": 3},
]

# set upper bound
df = pl.DataFrame({"lists": [[1, 1, 1], [0, 1, 0], [1, 0, 0]]})
assert df.lazy().select(pl.col("lists").list.to_struct(upper_bound=3)).unnest(
"lists"
).sum().collect().columns == ["field_0", "field_1", "field_2"]
assert df.lazy().select(
pl.col("lists").list.to_struct(upper_bound=3, _eager=True)
).unnest("lists").sum().collect().columns == ["field_0", "field_1", "field_2"]


def test_sort_df_with_list_struct() -> None:
Expand Down Expand Up @@ -1145,7 +1143,7 @@ def test_list_to_struct_19208() -> None:
}
)
assert pl.concat([df[0], df[1], df[2]]).select(
pl.col("nested").list.to_struct()
pl.col("nested").list.to_struct(_eager=True)
).to_dict(as_series=False) == {
"nested": [{"field_0": {"a": 1}}, {"field_0": None}, {"field_0": {"a": 3}}]
}
Expand Down
18 changes: 10 additions & 8 deletions py-polars/tests/unit/operations/namespaces/list/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,14 +638,14 @@ def test_list_unique2() -> None:
def test_list_to_struct() -> None:
df = pl.DataFrame({"n": [[0, 1, 2], [0, 1]]})

assert df.select(pl.col("n").list.to_struct()).rows(named=True) == [
assert df.select(pl.col("n").list.to_struct(_eager=True)).rows(named=True) == [
{"n": {"field_0": 0, "field_1": 1, "field_2": 2}},
{"n": {"field_0": 0, "field_1": 1, "field_2": None}},
]

assert df.select(pl.col("n").list.to_struct(fields=lambda idx: f"n{idx}")).rows(
named=True
) == [
assert df.select(
pl.col("n").list.to_struct(fields=lambda idx: f"n{idx}", _eager=True)
).rows(named=True) == [
{"n": {"n0": 0, "n1": 1, "n2": 2}},
{"n": {"n0": 0, "n1": 1, "n2": None}},
]
Expand All @@ -668,14 +668,16 @@ def test_list_to_struct() -> None:
# retrieve the lazy schema
# * The upper bound is respected during execution
q = df.lazy().select(
pl.col("n").list.to_struct(fields=str, upper_bound=2).struct.unnest()
pl.col("n")
.list.to_struct(fields=str, upper_bound=2, _eager=True)
.struct.unnest()
)
assert q.collect_schema() == {"0": pl.Int64, "1": pl.Int64}
assert_frame_equal(q.collect(), pl.DataFrame({"0": [0, 0], "1": [1, 1]}))

assert df.lazy().select(pl.col("n").list.to_struct()).collect_schema() == {
"n": pl.Unknown
}
assert df.lazy().select(
pl.col("n").list.to_struct(_eager=True)
).collect_schema() == {"n": pl.Unknown}


def test_select_from_list_to_struct_11143() -> None:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ def test_no_panic_pandas_nat() -> None:

def test_list_to_struct_invalid_type() -> None:
with pytest.raises(pl.exceptions.InvalidOperationError):
pl.DataFrame({"a": 1}).select(pl.col("a").list.to_struct())
pl.DataFrame({"a": 1}).to_series().list.to_struct()


def test_raise_invalid_agg() -> None:
Expand Down
4 changes: 3 additions & 1 deletion py-polars/tests/unit/test_projections.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ def test_unnest_columns_available() -> None:
q = df.with_columns(
pl.col("genres")
.str.split("|")
.list.to_struct(n_field_strategy="max_width", fields=lambda i: f"genre{i + 1}")
.list.to_struct(
n_field_strategy="max_width", fields=lambda i: f"genre{i + 1}", _eager=True
)
).unnest("genres")

out = q.collect()
Expand Down

0 comments on commit 588a22b

Please sign in to comment.