Skip to content

Commit

Permalink
feat(python): Allow mapping as syntactic sugar in str.replace_many (#…
Browse files Browse the repository at this point in the history
…18214)

Co-authored-by: Henry Harbeck <[email protected]>
  • Loading branch information
henryharbeck and Henry Harbeck authored Aug 27, 2024
1 parent 950e4c3 commit c579721
Show file tree
Hide file tree
Showing 5 changed files with 264 additions and 39 deletions.
7 changes: 2 additions & 5 deletions py-polars/polars/api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from functools import reduce
from operator import or_
from typing import TYPE_CHECKING, Callable, Generic, TypeVar
from warnings import warn

Expand All @@ -20,9 +18,8 @@
]

# do not allow override of polars' own namespaces (as registered by '_accessors')
_reserved_namespaces: set[str] = reduce(
or_,
(cls._accessors for cls in (pl.DataFrame, pl.Expr, pl.LazyFrame, pl.Series)),
_reserved_namespaces: set[str] = set.union(
*(cls._accessors for cls in (pl.DataFrame, pl.Expr, pl.LazyFrame, pl.Series))
)


Expand Down
4 changes: 0 additions & 4 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6959,10 +6959,6 @@ def join(
Note that joining on any other expressions than `col`
will turn off coalescing.
Returns
-------
DataFrame
See Also
--------
join_asof
Expand Down
121 changes: 100 additions & 21 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import warnings
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Mapping

import polars._reexport as pl
from polars import functions as F
Expand All @@ -11,7 +11,7 @@
)
from polars._utils.parse import parse_into_expression
from polars._utils.unstable import unstable
from polars._utils.various import find_stacklevel
from polars._utils.various import find_stacklevel, no_default
from polars._utils.wrap import wrap_expr
from polars.datatypes import Date, Datetime, Time, parse_into_dtype
from polars.datatypes.constants import N_INFER_DEFAULT
Expand All @@ -28,6 +28,7 @@
TimeUnit,
TransferEncoding,
)
from polars._utils.various import NoDefault


class ExprStringNameSpace:
Expand Down Expand Up @@ -2400,9 +2401,9 @@ def contains_any(
self, patterns: IntoExpr, *, ascii_case_insensitive: bool = False
) -> Expr:
"""
Use the aho-corasick algorithm to find matches.
Use the Aho-Corasick algorithm to find matches.
This version determines if any of the patterns find a match.
Determines if any of the patterns are contained in the string.
Parameters
----------
Expand All @@ -2413,6 +2414,11 @@ def contains_any(
When this option is enabled, searching will be performed without respect
to case for ASCII letters (a-z and A-Z) only.
Notes
-----
This method supports matching on string literals only, and does not support
regular expression matching.
Examples
--------
>>> _ = pl.Config.set_fmt_str_lengths(100)
Expand Down Expand Up @@ -2448,29 +2454,75 @@ def contains_any(

def replace_many(
self,
patterns: IntoExpr,
replace_with: IntoExpr,
patterns: IntoExpr | Mapping[str, str],
replace_with: IntoExpr | NoDefault = no_default,
*,
ascii_case_insensitive: bool = False,
) -> Expr:
"""
Use the aho-corasick algorithm to replace many matches.
Use the Aho-Corasick algorithm to replace many matches.
Parameters
----------
patterns
String patterns to search and replace.
Accepts expression input. Strings are parsed as column names, and other
non-expression inputs are parsed as literals. Also accepts a mapping of
patterns to their replacement as syntactic sugar for
`replace_many(pl.Series(mapping.keys()), pl.Series(mapping.values()))`.
replace_with
Strings to replace where a pattern was a match.
This can be broadcast, so it supports many:one and many:many.
Accepts expression input. Non-expression inputs are parsed as literals.
Length must match the length of `patterns` or have length 1. This can be
broadcasted, so it supports many:one and many:many.
ascii_case_insensitive
Enable ASCII-aware case-insensitive matching.
When this option is enabled, searching will be performed without respect
to case for ASCII letters (a-z and A-Z) only.
Notes
-----
This method supports matching on string literals only, and does not support
regular expression matching.
Examples
--------
Replace many patterns by passing sequences of equal length to the `patterns` and
`replace_with` parameters.
>>> _ = pl.Config.set_fmt_str_lengths(100)
>>> _ = pl.Config.set_tbl_width_chars(110)
>>> df = pl.DataFrame(
... {
... "lyrics": [
... "Everybody wants to rule the world",
... "Tell me what you want, what you really really want",
... "Can you feel the love tonight",
... ]
... }
... )
>>> df.with_columns(
... pl.col("lyrics")
... .str.replace_many(
... ["me", "you"],
... ["you", "me"],
... )
... .alias("confusing")
... )
shape: (3, 2)
┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐
│ lyrics ┆ confusing │
│ --- ┆ --- │
│ str ┆ str │
╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡
│ Everybody wants to rule the world ┆ Everybody wants to rule the world │
│ Tell me what you want, what you really really want ┆ Tell you what me want, what me really really want │
│ Can you feel the love tonight ┆ Can me feel the love tonight │
└────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘
Broadcast a replacement for many patterns by passing a string or a sequence of
length 1 to the `replace_with` parameter.
>>> _ = pl.Config.set_fmt_str_lengths(100)
>>> df = pl.DataFrame(
... {
Expand Down Expand Up @@ -2499,27 +2551,50 @@ def replace_many(
│ Tell me what you want, what you really really want ┆ Tell what want, what really really want │
│ Can you feel the love tonight ┆ Can feel the love tonight │
└────────────────────────────────────────────────────┴────────────────────────────────────────────┘
Passing a mapping with patterns and replacements is also supported as syntactic
sugar.
>>> _ = pl.Config.set_fmt_str_lengths(100)
>>> _ = pl.Config.set_tbl_width_chars(110)
>>> df = pl.DataFrame(
... {
... "lyrics": [
... "Everybody wants to rule the world",
... "Tell me what you want, what you really really want",
... "Can you feel the love tonight",
... ]
... }
... )
>>> mapping = {"me": "you", "you": "me", "want": "need"}
>>> df.with_columns(
... pl.col("lyrics")
... .str.replace_many(
... ["me", "you"],
... ["you", "me"],
... )
... .alias("confusing")
... ) # doctest: +IGNORE_RESULT
... pl.col("lyrics").str.replace_many(mapping).alias("confusing")
... )
shape: (3, 2)
┌────────────────────────────────────────────────────┬───────────────────────────────────────────────────┐
│ lyrics ┆ confusing │
│ --- ┆ --- │
│ str ┆ str │
╞════════════════════════════════════════════════════╪═══════════════════════════════════════════════════╡
│ Everybody wants to rule the world ┆ Everybody wants to rule the world │
│ Tell me what you want, what you really really want ┆ Tell you what me want, what me really really want
│ Everybody wants to rule the world ┆ Everybody needs to rule the world │
│ Tell me what you want, what you really really want ┆ Tell you what me need, what me really really need
│ Can you feel the love tonight ┆ Can me feel the love tonight │
└────────────────────────────────────────────────────┴───────────────────────────────────────────────────┘
""" # noqa: W505
if replace_with is no_default:
if not isinstance(patterns, Mapping):
msg = "`replace_with` argument is required if `patterns` argument is not a Mapping type"
raise TypeError(msg)
# Early return in case of an empty mapping.
if not patterns:
return wrap_expr(self._pyexpr)
replace_with = pl.Series(patterns.values())
patterns = pl.Series(patterns.keys())

patterns = parse_into_expression(
patterns, str_as_lit=False, list_as_series=True
patterns, # type: ignore[arg-type]
str_as_lit=False,
list_as_series=True,
)
replace_with = parse_into_expression(
replace_with, str_as_lit=True, list_as_series=True
Expand All @@ -2539,8 +2614,7 @@ def extract_many(
overlapping: bool = False,
) -> Expr:
"""
Use the aho-corasick algorithm to extract many matches.
Use the Aho-Corasick algorithm to extract many matches.
Parameters
----------
Expand All @@ -2553,6 +2627,11 @@ def extract_many(
overlapping
Whether matches may overlap.
Notes
-----
This method supports matching on string literals only, and does not support
regular expression matching.
Examples
--------
>>> _ = pl.Config.set_fmt_str_lengths(100)
Expand Down
Loading

0 comments on commit c579721

Please sign in to comment.