From c5797217fc89635735ff68a77bc4a6e42ec0b587 Mon Sep 17 00:00:00 2001 From: Henry Harbeck <59268910+henryharbeck@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:19:20 +1000 Subject: [PATCH] feat(python): Allow mapping as syntactic sugar in `str.replace_many` (#18214) Co-authored-by: Henry Harbeck --- py-polars/polars/api.py | 7 +- py-polars/polars/dataframe/frame.py | 4 - py-polars/polars/expr/string.py | 121 +++++++++++++++--- py-polars/polars/series/string.py | 82 ++++++++++-- .../namespaces/string/test_string.py | 89 ++++++++++++- 5 files changed, 264 insertions(+), 39 deletions(-) diff --git a/py-polars/polars/api.py b/py-polars/polars/api.py index 44ba02084778..84262d1b7998 100644 --- a/py-polars/polars/api.py +++ b/py-polars/polars/api.py @@ -1,7 +1,5 @@ from __future__ import annotations -from functools import reduce -from operator import or_ from typing import TYPE_CHECKING, Callable, Generic, TypeVar from warnings import warn @@ -20,9 +18,8 @@ ] # do not allow override of polars' own namespaces (as registered by '_accessors') -_reserved_namespaces: set[str] = reduce( - or_, - (cls._accessors for cls in (pl.DataFrame, pl.Expr, pl.LazyFrame, pl.Series)), +_reserved_namespaces: set[str] = set.union( + *(cls._accessors for cls in (pl.DataFrame, pl.Expr, pl.LazyFrame, pl.Series)) ) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index ff73cdc37c60..d065bc24ce90 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -6959,10 +6959,6 @@ def join( Note that joining on any other expressions than `col` will turn off coalescing. 
- Returns - ------- - DataFrame - See Also -------- join_asof diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index bf8e739462b2..351c1cdd8565 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -1,7 +1,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Mapping import polars._reexport as pl from polars import functions as F @@ -11,7 +11,7 @@ ) from polars._utils.parse import parse_into_expression from polars._utils.unstable import unstable -from polars._utils.various import find_stacklevel +from polars._utils.various import find_stacklevel, no_default from polars._utils.wrap import wrap_expr from polars.datatypes import Date, Datetime, Time, parse_into_dtype from polars.datatypes.constants import N_INFER_DEFAULT @@ -28,6 +28,7 @@ TimeUnit, TransferEncoding, ) + from polars._utils.various import NoDefault class ExprStringNameSpace: @@ -2400,9 +2401,9 @@ def contains_any( self, patterns: IntoExpr, *, ascii_case_insensitive: bool = False ) -> Expr: """ - Use the aho-corasick algorithm to find matches. + Use the Aho-Corasick algorithm to find matches. - This version determines if any of the patterns find a match. + Determines if any of the patterns are contained in the string. Parameters ---------- @@ -2413,6 +2414,11 @@ def contains_any( When this option is enabled, searching will be performed without respect to case for ASCII letters (a-z and A-Z) only. + Notes + ----- + This method supports matching on string literals only, and does not support + regular expression matching. 
+ Examples -------- >>> _ = pl.Config.set_fmt_str_lengths(100) @@ -2448,29 +2454,75 @@ def contains_any( def replace_many( self, - patterns: IntoExpr, - replace_with: IntoExpr, + patterns: IntoExpr | Mapping[str, str], + replace_with: IntoExpr | NoDefault = no_default, *, ascii_case_insensitive: bool = False, ) -> Expr: """ - - Use the aho-corasick algorithm to replace many matches. + Use the Aho-Corasick algorithm to replace many matches. Parameters ---------- patterns String patterns to search and replace. + Accepts expression input. Strings are parsed as column names, and other + non-expression inputs are parsed as literals. Also accepts a mapping of + patterns to their replacement as syntactic sugar for + `replace_many(pl.Series(mapping.keys()), pl.Series(mapping.values()))`. replace_with Strings to replace where a pattern was a match. - This can be broadcast, so it supports many:one and many:many. + Accepts expression input. Non-expression inputs are parsed as literals. + Length must match the length of `patterns` or have length 1. This can be + broadcast, so it supports many:one and many:many. ascii_case_insensitive Enable ASCII-aware case-insensitive matching. When this option is enabled, searching will be performed without respect to case for ASCII letters (a-z and A-Z) only. + Notes + ----- + This method supports matching on string literals only, and does not support + regular expression matching. + Examples -------- + Replace many patterns by passing sequences of equal length to the `patterns` and + `replace_with` parameters. + + >>> _ = pl.Config.set_fmt_str_lengths(100) + >>> _ = pl.Config.set_tbl_width_chars(110) + >>> df = pl.DataFrame( + ... { + ... "lyrics": [ + ... "Everybody wants to rule the world", + ... "Tell me what you want, what you really really want", + ... "Can you feel the love tonight", + ... ] + ... } + ... ) + >>> df.with_columns( + ... pl.col("lyrics") + ... .str.replace_many( + ... ["me", "you"], + ... ["you", "me"], + ... ) + ... 
.alias("confusing") + ... ) + shape: (3, 2) + ┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐ + │ lyrics ┆ confusing │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡ + │ Everybody wants to rule the world ┆ Everybody wants to rule the world │ + │ Tell me what you want, what you really really want ┆ Tell you what me want, what me really really want │ + │ Can you feel the love tonight ┆ Can me feel the love tonight │ + └────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘ + + Broadcast a replacement for many patterns by passing a string or a sequence of + length 1 to the `replace_with` parameter. + >>> _ = pl.Config.set_fmt_str_lengths(100) >>> df = pl.DataFrame( ... { @@ -2499,27 +2551,50 @@ def replace_many( │ Tell me what you want, what you really really want ┆ Tell what want, what really really want │ │ Can you feel the love tonight ┆ Can feel the love tonight │ └────────────────────────────────────────────────────┴────────────────────────────────────────────┘ + + Passing a mapping with patterns and replacements is also supported as syntactic + sugar. + + >>> _ = pl.Config.set_fmt_str_lengths(100) + >>> _ = pl.Config.set_tbl_width_chars(110) + >>> df = pl.DataFrame( + ... { + ... "lyrics": [ + ... "Everybody wants to rule the world", + ... "Tell me what you want, what you really really want", + ... "Can you feel the love tonight", + ... ] + ... } + ... ) + >>> mapping = {"me": "you", "you": "me", "want": "need"} >>> df.with_columns( - ... pl.col("lyrics") - ... .str.replace_many( - ... ["me", "you"], - ... ["you", "me"], - ... ) - ... .alias("confusing") - ... ) # doctest: +IGNORE_RESULT + ... pl.col("lyrics").str.replace_many(mapping).alias("confusing") + ... 
) shape: (3, 2) ┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐ │ lyrics ┆ confusing │ │ --- ┆ --- │ │ str ┆ str │ ╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡ - │ Everybody wants to rule the world ┆ Everybody wants to rule the world │ - │ Tell me what you want, what you really really want ┆ Tell you what me want, what me really really want │ + │ Everybody wants to rule the world ┆ Everybody needs to rule the world │ + │ Tell me what you want, what you really really want ┆ Tell you what me need, what me really really need │ │ Can you feel the love tonight ┆ Can me feel the love tonight │ └────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘ """ # noqa: W505 + if replace_with is no_default: + if not isinstance(patterns, Mapping): + msg = "`replace_with` argument is required if `patterns` argument is not a Mapping type" + raise TypeError(msg) + # Early return in case of an empty mapping. + if not patterns: + return wrap_expr(self._pyexpr) + replace_with = pl.Series(patterns.values()) + patterns = pl.Series(patterns.keys()) + patterns = parse_into_expression( - patterns, str_as_lit=False, list_as_series=True + patterns, # type: ignore[arg-type] + str_as_lit=False, + list_as_series=True, ) replace_with = parse_into_expression( replace_with, str_as_lit=True, list_as_series=True @@ -2539,8 +2614,7 @@ def extract_many( overlapping: bool = False, ) -> Expr: """ - - Use the aho-corasick algorithm to extract many matches. + Use the Aho-Corasick algorithm to extract many matches. Parameters ---------- @@ -2553,6 +2627,11 @@ def extract_many( overlapping Whether matches may overlap. + Notes + ----- + This method supports matching on string literals only, and does not support + regular expression matching. 
+ Examples -------- >>> _ = pl.Config.set_fmt_str_lengths(100) diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index cb6d9fbb999d..953803fc1253 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1,9 +1,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Mapping from polars._utils.deprecation import deprecate_function from polars._utils.unstable import unstable +from polars._utils.various import no_default from polars.datatypes.constants import N_INFER_DEFAULT from polars.series.utils import expr_dispatch @@ -18,6 +19,7 @@ TimeUnit, TransferEncoding, ) + from polars._utils.various import NoDefault from polars.polars import PySeries @@ -1818,9 +1820,9 @@ def contains_any( self, patterns: Series | list[str], *, ascii_case_insensitive: bool = False ) -> Series: """ - Use the aho-corasick algorithm to find matches. + Use the Aho-Corasick algorithm to find matches. - This version determines if any of the patterns find a match. + Determines if any of the patterns are contained in the string. Parameters ---------- @@ -1831,6 +1833,11 @@ def contains_any( When this option is enabled, searching will be performed without respect to case for ASCII letters (a-z and A-Z) only. + Notes + ----- + This method supports matching on string literals only, and does not support + regular expression matching. + Examples -------- >>> _ = pl.Config.set_fmt_str_lengths(100) @@ -1854,28 +1861,39 @@ def contains_any( def replace_many( self, - patterns: Series | list[str], - replace_with: Series | list[str] | str, + patterns: Series | list[str] | Mapping[str, str], + replace_with: Series | list[str] | str | NoDefault = no_default, *, ascii_case_insensitive: bool = False, ) -> Series: """ - Use the aho-corasick algorithm to replace many matches. + Use the Aho-Corasick algorithm to replace many matches. 
Parameters ---------- patterns String patterns to search and replace. + Also accepts a mapping of patterns to their replacement as syntactic sugar + for `replace_many(pl.Series(mapping.keys()), pl.Series(mapping.values()))`. replace_with Strings to replace where a pattern was a match. - This can be broadcast, so it supports many:one and many:many. + Length must match the length of `patterns` or have length 1. This can be + broadcast, so it supports many:one and many:many. ascii_case_insensitive Enable ASCII-aware case-insensitive matching. When this option is enabled, searching will be performed without respect to case for ASCII letters (a-z and A-Z) only. + Notes + ----- + This method supports matching on string literals only, and does not support + regular expression matching. + Examples -------- + Replace many patterns by passing lists of equal length to the `patterns` and + `replace_with` parameters. + >>> _ = pl.Config.set_fmt_str_lengths(100) >>> s = pl.Series( ... "lyrics", @@ -1893,6 +1911,49 @@ def replace_many( "Tell you what me want, what me really really want" "Can me feel the love tonight" ] + + Broadcast a replacement for many patterns by passing a string or a sequence of + length 1 to the `replace_with` parameter. + + >>> _ = pl.Config.set_fmt_str_lengths(100) + >>> s = pl.Series( + ... "lyrics", + ... [ + ... "Everybody wants to rule the world", + ... "Tell me what you want, what you really really want", + ... "Can you feel the love tonight", + ... ], + ... ) + >>> s.str.replace_many(["me", "you", "they"], "") + shape: (3,) + Series: 'lyrics' [str] + [ + "Everybody wants to rule the world" + "Tell what want, what really really want" + "Can feel the love tonight" + ] + + Passing a mapping with patterns and replacements is also supported as syntactic + sugar. + + >>> _ = pl.Config.set_fmt_str_lengths(100) + >>> s = pl.Series( + ... "lyrics", + ... [ + ... "Everybody wants to rule the world", + ... 
"Tell me what you want, what you really really want", + ... "Can you feel the love tonight", + ... ], + ... ) + >>> mapping = {"me": "you", "you": "me", "want": "need"} + >>> s.str.replace_many(mapping) + shape: (3,) + Series: 'lyrics' [str] + [ + "Everybody needs to rule the world" + "Tell you what me need, what me really really need" + "Can me feel the love tonight" + ] """ @unstable() @@ -1904,7 +1965,7 @@ def extract_many( overlapping: bool = False, ) -> Series: """ - Use the aho-corasick algorithm to extract many matches. + Use the Aho-Corasick algorithm to extract many matches. Parameters ---------- @@ -1917,6 +1978,11 @@ def extract_many( overlapping Whether matches may overlap. + Notes + ----- + This method supports matching on string literals only, and does not support + regular expression matching. + Examples -------- >>> s = pl.Series("values", ["discontent"]) diff --git a/py-polars/tests/unit/operations/namespaces/string/test_string.py b/py-polars/tests/unit/operations/namespaces/string/test_string.py index 7567d1557f36..fe47b8d07d2e 100644 --- a/py-polars/tests/unit/operations/namespaces/string/test_string.py +++ b/py-polars/tests/unit/operations/namespaces/string/test_string.py @@ -4,7 +4,12 @@ import polars as pl import polars.selectors as cs -from polars.exceptions import ComputeError, InvalidOperationError +from polars.exceptions import ( + ColumnNotFoundError, + ComputeError, + InvalidOperationError, + SchemaError, +) from polars.testing import assert_frame_equal, assert_series_equal @@ -1061,6 +1066,88 @@ def test_replace_many( ) +@pytest.mark.parametrize( + ("mapping", "case_insensitive", "expected"), + [ + ({}, False, "Tell me what you want"), + ({"me": "them"}, False, "Tell them what you want"), + ({"who": "them"}, False, "Tell me what you want"), + ({"me": "it", "you": "it"}, False, "Tell it what it want"), + ({"Me": "it", "you": "it"}, False, "Tell me what it want"), + ({"me": "you", "you": "me"}, False, "Tell you what me want"), + ({}, 
True, "Tell me what you want"), + ({"Me": "it", "you": "it"}, True, "Tell it what it want"), + ({"me": "you", "YOU": "me"}, True, "Tell you what me want"), + ], +) +def test_replace_many_mapping( + mapping: dict[str, str], + case_insensitive: bool, + expected: str, +) -> None: + df = pl.DataFrame({"text": ["Tell me what you want"]}) + # series + assert ( + expected + == df["text"] + .str.replace_many(mapping, ascii_case_insensitive=case_insensitive) + .item() + ) + # expr + assert ( + expected + == df.select( + pl.col("text").str.replace_many( + mapping, + ascii_case_insensitive=case_insensitive, + ) + ).item() + ) + + +def test_replace_many_invalid_inputs() -> None: + df = pl.DataFrame({"text": ["Tell me what you want"]}) + + # Ensure a string as the first argument is parsed as a column name. + with pytest.raises(ColumnNotFoundError, match="me"): + df.select(pl.col("text").str.replace_many("me", "you")) + + with pytest.raises(SchemaError): + df.select(pl.col("text").str.replace_many(1, 2)) + + with pytest.raises(SchemaError): + df.select(pl.col("text").str.replace_many([1], [2])) + + with pytest.raises(SchemaError): + df.select(pl.col("text").str.replace_many(["me"], None)) + + with pytest.raises(TypeError): + df.select(pl.col("text").str.replace_many(["me"])) + + with pytest.raises( + InvalidOperationError, + match="expected the same amount of patterns as replacement strings", + ): + df.select(pl.col("text").str.replace_many(["a"], ["b", "c"])) + + s = df.to_series() + + with pytest.raises(ColumnNotFoundError, match="me"): + s.str.replace_many("me", "you") # type: ignore[arg-type] + + with pytest.raises(SchemaError): + df.select(pl.col("text").str.replace_many(["me"], None)) + + with pytest.raises(TypeError): + df.select(pl.col("text").str.replace_many(["me"])) + + with pytest.raises( + InvalidOperationError, + match="expected the same amount of patterns as replacement strings", + ): + s.str.replace_many(["a"], ["b", "c"]) + + def test_extract_all_count() -> 
None: df = pl.DataFrame({"foo": ["123 bla 45 asd", "xaz 678 910t", "boo", None]}) assert (