From 7763bd48e7615480e9b1731ee272b7aa2313e153 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Tue, 25 Jun 2024 13:57:19 -0400 Subject: [PATCH] docs(python): More accurate and helpful docs for user defined functions (#15194) Co-authored-by: Itamar Turner-Trauring Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- docs/requirements.txt | 1 + .../python/user-guide/expressions/structs.py | 12 ++ .../expressions/user-defined-functions.py | 117 ++++++++---- .../rust/user-guide/expressions/structs.rs | 49 +++++ .../expressions/user-defined-functions.rs | 88 +-------- docs/user-guide/expressions/numpy.md | 4 +- docs/user-guide/expressions/structs.md | 9 +- .../expressions/user-defined-functions.md | 177 ++++++++---------- py-polars/requirements-dev.txt | 1 + py-polars/tests/docs/test_user_guide.py | 3 + 10 files changed, 243 insertions(+), 218 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 0922ff44d8b..072c07aad41 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,6 +5,7 @@ matplotlib seaborn plotly altair +numba # Unpin NumPy when support is implemented in numpy crate: # https://github.com/pola-rs/polars/issues/16998 numpy<2 diff --git a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py index ee034a362bc..01e21cca25b 100644 --- a/docs/src/python/user-guide/expressions/structs.py +++ b/docs/src/python/user-guide/expressions/structs.py @@ -64,3 +64,15 @@ ).filter(pl.struct("Movie", "Theatre").is_duplicated()) print(out) # --8<-- [end:struct_ranking] + +# --8<-- [start:multi_column_apply] +df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]}) + +out = df.select( + pl.struct(["keys", "values"]) + .map_elements(lambda x: len(x["keys"]) + x["values"]) + .alias("solution_map_elements"), + (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), +) +print(out) +# --8<-- [end:multi_column_apply] diff --git 
a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 6c248691e1a..a436a6d8241 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -7,59 +7,104 @@ # --8<-- [start:dataframe] df = pl.DataFrame( { - "keys": ["a", "a", "b"], - "values": [10, 7, 1], + "keys": ["a", "a", "b", "b"], + "values": [10, 7, 1, 23], } ) print(df) # --8<-- [end:dataframe] -# --8<-- [start:shift_map_batches] -out = df.group_by("keys", maintain_order=True).agg( - pl.col("values") - .map_batches(lambda s: s.shift(), is_elementwise=True) - .alias("shift_map_batches"), - pl.col("values").shift().alias("shift_expression"), -) +# --8<-- [start:individual_log] +import math + + +def my_log(value): + return math.log(value) + + +out = df.select(pl.col("values").map_elements(my_log, return_dtype=pl.Float64)) print(out) -# --8<-- [end:shift_map_batches] +# --8<-- [end:individual_log] -# --8<-- [start:map_elements] -out = df.group_by("keys", maintain_order=True).agg( - pl.col("values") - .map_elements(lambda s: s.shift(), return_dtype=pl.List(int)) - .alias("shift_map_elements"), - pl.col("values").shift().alias("shift_expression"), -) +# --8<-- [start:diff_from_mean] +def diff_from_mean(series): + # This will be very slow for non-trivial Series, since it's all Python + # code: + total = 0 + for value in series: + total += value + mean = total / len(series) + return pl.Series([value - mean for value in series]) + + +# Apply our custom function to a full Series with map_batches(): +out = df.select(pl.col("values").map_batches(diff_from_mean)) +print("== select() with UDF ==") +print(out) + +# Apply our custom function per group: +print("== group_by() with UDF ==") +out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean)) print(out) -# --8<-- [end:map_elements] +# --8<-- [end:diff_from_mean] -# --8<-- 
[start:counter] -counter = 0 +# --8<-- [start:np_log] +import numpy as np +out = df.select(pl.col("values").map_batches(np.log)) +print(out) +# --8<-- [end:np_log] -def add_counter(val: int) -> int: - global counter - counter += 1 - return counter + val +# --8<-- [start:diff_from_mean_numba] +from numba import guvectorize, int64, float64 -out = df.select( - pl.col("values") - .map_elements(add_counter, return_dtype=pl.Int64) - .alias("solution_map_elements"), - (pl.col("values") + pl.int_range(1, pl.len() + 1)).alias("solution_expr"), -) +# This will be compiled to machine code, so it will be fast. The Series is +# converted to a NumPy array before being passed to the function. See the +# Numba documentation for more details: +# https://numba.readthedocs.io/en/stable/user/vectorize.html +@guvectorize([(int64[:], float64[:])], "(n)->(n)") +def diff_from_mean_numba(arr, result): + total = 0 + for value in arr: + total += value + mean = total / len(arr) + for i, value in enumerate(arr): + result[i] = value - mean + + +out = df.select(pl.col("values").map_batches(diff_from_mean_numba)) +print("== select() with UDF ==") +print(out) + +out = df.group_by("keys").agg(pl.col("values").map_batches(diff_from_mean_numba)) +print("== group_by() with UDF ==") print(out) -# --8<-- [end:counter] +# --8<-- [end:diff_from_mean_numba] + # --8<-- [start:combine] -out = df.select( - pl.struct("keys", "values") - .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64) - .alias("solution_map_elements"), - (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), +# Add two arrays together: +@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)") +def add(arr, arr2, result): + for i in range(len(arr)): + result[i] = arr[i] + arr2[i] + + +df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]}) + +out = df3.select( + # Create a struct that has two columns in it: + pl.struct(["values1", "values2"]) + # Pass the struct to a lambda 
that then passes the individual columns to + # the add() function: + .map_batches( + lambda combined: add( + combined.struct.field("values1"), combined.struct.field("values2") + ) + ) + .alias("add_columns") ) print(out) # --8<-- [end:combine] diff --git a/docs/src/rust/user-guide/expressions/structs.rs b/docs/src/rust/user-guide/expressions/structs.rs index abb05e99ad5..b064d2c4166 100644 --- a/docs/src/rust/user-guide/expressions/structs.rs +++ b/docs/src/rust/user-guide/expressions/structs.rs @@ -95,5 +95,54 @@ fn main() -> Result<(), Box> { println!("{}", &out); // --8<-- [end:struct_ranking] + // --8<-- [start:multi_column_apply] + let df = df!( + "keys" => &["a", "a", "b"], + "values" => &[10, 7, 1], + )?; + + let out = df + .lazy() + .select([ + // pack to struct to get access to multiple fields in a custom `apply/map` + as_struct(vec![col("keys"), col("values")]) + // we will compute the len(a) + b + .apply( + |s| { + // downcast to struct + let ca = s.struct_()?; + + // get the fields as Series + let s_a = &ca.fields()[0]; + let s_b = &ca.fields()[1]; + + // downcast the `Series` to their known type + let ca_a = s_a.str()?; + let ca_b = s_b.i32()?; + + // iterate both `ChunkedArrays` + let out: Int32Chunked = ca_a + .into_iter() + .zip(ca_b) + .map(|(opt_a, opt_b)| match (opt_a, opt_b) { + (Some(a), Some(b)) => Some(a.len() as i32 + b), + _ => None, + }) + .collect(); + + Ok(Some(out.into_series())) + }, + GetOutput::from_type(DataType::Int32), + ) + // note: the `'solution_map_elements'` alias is just there to show how you + // get the same output as in the Python API example. 
+ .alias("solution_map_elements"), + (col("keys").str().count_matches(lit("."), true) + col("values")) + .alias("solution_expr"), + ]) + .collect()?; + println!("{}", out); + + // --8<-- [end:multi_column_apply] Ok(()) } diff --git a/docs/src/rust/user-guide/expressions/user-defined-functions.rs b/docs/src/rust/user-guide/expressions/user-defined-functions.rs index 56661fcabc8..b83898ef6c7 100644 --- a/docs/src/rust/user-guide/expressions/user-defined-functions.rs +++ b/docs/src/rust/user-guide/expressions/user-defined-functions.rs @@ -3,93 +3,25 @@ use polars::prelude::*; fn main() -> Result<(), Box> { // --8<-- [start:dataframe] let df = df!( - "keys" => &["a", "a", "b"], - "values" => &[10, 7, 1], + "keys" => &["a", "a", "b", "b"], + "values" => &[10, 7, 1, 23], )?; println!("{}", df); // --8<-- [end:dataframe] - // --8<-- [start:shift_map_batches] - let out = df - .clone() - .lazy() - .group_by(["keys"]) - .agg([ - col("values") - .map(|s| Ok(Some(s.shift(1))), GetOutput::default()) - // note: the `'shift_map_batches'` alias is just there to show how you - // get the same output as in the Python API example. - .alias("shift_map_batches"), - col("values").shift(lit(1)).alias("shift_expression"), - ]) - .collect()?; + // --8<-- [start:individual_log] + // --8<-- [end:individual_log] - println!("{}", out); - // --8<-- [end:shift_map_batches] + // --8<-- [start:diff_from_mean] + // --8<-- [end:diff_from_mean] - // --8<-- [start:map_elements] - let out = df - .clone() - .lazy() - .group_by([col("keys")]) - .agg([ - col("values") - .apply(|s| Ok(Some(s.shift(1))), GetOutput::default()) - // note: the `'shift_map_elements'` alias is just there to show how you - // get the same output as in the Python API example. 
- .alias("shift_map_elements"), - col("values").shift(lit(1)).alias("shift_expression"), - ]) - .collect()?; - println!("{}", out); - // --8<-- [end:map_elements] + // --8<-- [start:np_log] + // --8<-- [end:np_log] - // --8<-- [start:counter] - - // --8<-- [end:counter] + // --8<-- [start:diff_from_mean_numba] + // --8<-- [end:diff_from_mean_numba] // --8<-- [start:combine] - let out = df - .lazy() - .select([ - // pack to struct to get access to multiple fields in a custom `apply/map` - as_struct(vec![col("keys"), col("values")]) - // we will compute the len(a) + b - .apply( - |s| { - // downcast to struct - let ca = s.struct_()?; - - // get the fields as Series - let s_a = &ca.fields()[0]; - let s_b = &ca.fields()[1]; - - // downcast the `Series` to their known type - let ca_a = s_a.str()?; - let ca_b = s_b.i32()?; - - // iterate both `ChunkedArrays` - let out: Int32Chunked = ca_a - .into_iter() - .zip(ca_b) - .map(|(opt_a, opt_b)| match (opt_a, opt_b) { - (Some(a), Some(b)) => Some(a.len() as i32 + b), - _ => None, - }) - .collect(); - - Ok(Some(out.into_series())) - }, - GetOutput::from_type(DataType::Int32), - ) - // note: the `'solution_map_elements'` alias is just there to show how you - // get the same output as in the Python API example. - .alias("solution_map_elements"), - (col("keys").str().count_matches(lit("."), true) + col("values")) - .alias("solution_expr"), - ]) - .collect()?; - println!("{}", out); // --8<-- [end:combine] Ok(()) } diff --git a/docs/user-guide/expressions/numpy.md b/docs/user-guide/expressions/numpy.md index 6500e87b520..4a5a46978b5 100644 --- a/docs/user-guide/expressions/numpy.md +++ b/docs/user-guide/expressions/numpy.md @@ -15,8 +15,8 @@ This means that if a function is not provided by Polars, we can use NumPy and we ### Interoperability -Polars `Series` have support for NumPy universal functions (ufuncs). Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead. 
+Polars `Series` have support for NumPy universal functions (ufuncs) and generalized ufuncs. Element-wise functions such as `np.exp()`, `np.cos()`, `np.divide()`, etc. all work with almost zero overhead. -However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results. +However, as a Polars-specific remark: missing values are a separate bitmask and are not visible to NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results, so an error will be raised if you pass a `Series` with missing data to a generalized ufunc. Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion. diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md index 056c1b2e21b..d692c05ad0a 100644 --- a/docs/user-guide/expressions/structs.md +++ b/docs/user-guide/expressions/structs.md @@ -96,4 +96,11 @@ That's a pretty complex set of requirements done very elegantly in Polars! ### Using multi-column apply -This was discussed in the previous section on _User Defined Functions_. +This was discussed in the previous section on _User Defined Functions_ for the Python case. 
+Here's an example of doing so with both Python and Rust: + +{{code_block('user-guide/expressions/structs','multi_column_apply',[])}} + +```python exec="on" result="text" session="user-guide/structs" +--8<-- "python/user-guide/expressions/structs.py:multi_column_apply" +``` diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 67c618c220f..dc994148c63 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -1,47 +1,17 @@ # User-defined functions (Python) -You should be convinced by now that Polars expressions are so powerful and flexible that there is much less need for custom Python functions -than in other libraries. +Polars expressions are quite powerful and flexible, so there is much less need for custom Python functions compared to other libraries. +Still, you may need to pass an expression's state to a third party library or apply your black box function to data in Polars. -Still, you need to have the power to be able to pass an expression's state to a third party library or apply your black box function -over data in Polars. +In this part of the documentation we'll be using two APIs that allow you to do this: -For this we provide the following expressions: +- [:material-api: `map_elements`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_elements.html): Call a function separately on each value in the `Series`. +- [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html): Always passes the full `Series` to the function. -- `map_batches` -- `map_elements` +## Processing individual values with `map_elements()` -## To `map_batches` or to `map_elements`. - -These functions have an important distinction in how they operate and consequently what data they will pass to the user. 
- -A `map_batches` passes the `Series` backed by the `expression` as is. - -`map_batches` follows the same rules in both the `select` and the `group_by` context, this will -mean that the `Series` represents a column in a `DataFrame`. Note that in the `group_by` context, that column is not yet -aggregated! - -Use cases for `map_batches` are for instance passing the `Series` in an expression to a third party library. Below we show how -we could use `map_batches` to pass an expression column to a neural network model. - -=== ":fontawesome-brands-python: Python" -[:material-api: `map_batches`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_batches.html) - -```python -df.with_columns([ - pl.col("features").map_batches(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") -]) -``` - -=== ":fontawesome-brands-rust: Rust" - -```rust -df.with_columns([ - col("features").map(|s| Ok(my_nn.forward(s))).alias("activations") -]) -``` - -Use cases for `map_batches` in the `group_by` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why. +Let's start with the simplest case: we want to process each value in a `Series` individually. +Here is our data: {{code_block('user-guide/expressions/user-defined-functions','dataframe',[])}} @@ -50,94 +20,92 @@ Use cases for `map_batches` in the `group_by` context are slim. They are only us --8<-- "python/user-guide/expressions/user-defined-functions.py:dataframe" ``` -In the snippet above we group by the `"keys"` column. 
That means we have the following groups: - -```c -"a" -> [10, 7] -"b" -> [1] -``` +We'll call `math.log()` on each individual value: -If we would then apply a `shift` operation to the right, we'd expect: +{{code_block('user-guide/expressions/user-defined-functions','individual_log',[])}} -```c -"a" -> [null, 10] -"b" -> [null] +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:individual_log" ``` -Let's try that out and see what we get: +While this works, `map_elements()` has two problems: -{{code_block('user-guide/expressions/user-defined-functions','shift_map_batches',[])}} +1. **Limited to individual items:** Often you'll want to have a calculation that needs to operate on the whole `Series`, rather than individual items one by one. +2. **Performance overhead:** Even if you do want to process each item individually, calling a function for each individual item is slow; all those extra function calls add a lot of overhead. -```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:shift_map_batches" -``` +Let's start by solving the first problem, and then we'll see how to solve the second problem. -Ouch.. we clearly get the wrong results here. Group `"b"` even got a value from group `"a"` 😵. +## Processing a whole `Series` with `map_batches()` -This went horribly wrong because `map_batches` applied the function before aggregation, due to the `is_elementwise=True` parameter being provided. So that means the whole column `[10, 7, 1]` got shifted to `[null, 10, 7]` and was then aggregated. +We want to run a custom function on the contents of a whole `Series`. +For demonstration purposes, let's say we want to calculate the difference between the mean of a `Series` and each value. -So my advice is to never use `map_batches` in the `group_by` context unless you know you need it and know what you are doing. 
+We can use the `map_batches()` API to run this function on either the full `Series` or individual groups in a `group_by()`: -## To `map_elements` +{{code_block('user-guide/expressions/user-defined-functions','diff_from_mean',[])}} -Luckily we can fix previous example with `map_elements`. `map_elements` works on the smallest logical elements for that operation. +```python exec="on" result="text" session="user-guide/udf" +--8<-- "python/user-guide/expressions/user-defined-functions.py:diff_from_mean" +``` -That is: +## Fast operations with user-defined functions -- `select context` -> single elements -- `group by context` -> single groups +The problem with a pure-Python implementation is that it's slow. +In general, you want to minimize how much Python code you call if you want fast results. -So with `map_elements` we should be able to fix our example: +To maximize speed, you'll want to make sure that you're using a function written in a compiled language. +For numeric calculations Polars supports a pair of interfaces defined by NumPy called ["ufuncs"](https://numpy.org/doc/stable/reference/ufuncs.html) and ["generalized ufuncs"](https://numpy.org/neps/nep-0005-generalized-ufuncs.html). +The former runs on each item individually, and the latter accepts a whole NumPy array, which allows for more flexible operations. -=== ":fontawesome-brands-python: Python" -[:material-api: `map_elements`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_elements.html) +[NumPy](https://numpy.org/doc/stable/reference/ufuncs.html) and other libraries like [SciPy](https://docs.scipy.org/doc/scipy/reference/special.html#module-scipy.special) come with pre-written ufuncs you can use with Polars. 
+For example: -{{code_block('user-guide/expressions/user-defined-functions','map_elements',[])}} +{{code_block('user-guide/expressions/user-defined-functions','np_log',[])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:map_elements" +--8<-- "python/user-guide/expressions/user-defined-functions.py:np_log" ``` -And observe, a valid result! 🎉 - -## `map_elements` in the `select` context +Notice that we can use `map_batches()`, because `numpy.log()` is able to run on both individual items and on whole NumPy arrays. +This means it will run much faster than our original example, since we only have a single Python call and then all processing happens in a fast low-level language. -In the `select` context, the `map_elements` expression passes elements of the column to the Python function. +## Example: A fast custom function using Numba -_Note that you are now running Python, this will be slow._ +The pre-written functions NumPy provides are helpful, but our goal is to write our own functions. +For example, let's say we want a fast version of our `diff_from_mean()` example above. +The easiest way to write this in Python is to use [Numba](https://numba.readthedocs.io/en/stable/), which allows you to write custom functions in (a subset of) Python while still getting the benefit of compiled code. -Let's go through some examples to see what to expect. We will continue with the `DataFrame` we defined at the start of -this section and show an example with the `map_elements` function and a counter example where we use the expression API to -achieve the same goals. +In particular, Numba provides a decorator called [`@guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator). +This creates a generalized ufunc by compiling a Python function to fast machine code, in a way that allows it to be used by Polars. 
-### Adding a counter +In the following example the `diff_from_mean_numba()` will be compiled to fast machine code at import time, which will take a little time. +After that all calls to the function will run quickly. +The `Series` will be converted to a NumPy array before being passed to the function: -In this example we create a global `counter` and then add the integer `1` to the global state at every element processed. -Every iteration the result of the increment will be added to the element value. - -> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this `apply` is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. - -{{code_block('user-guide/expressions/user-defined-functions','counter',[])}} +{{code_block('user-guide/expressions/user-defined-functions','diff_from_mean_numba',[])}} ```python exec="on" result="text" session="user-guide/udf" ---8<-- "python/user-guide/expressions/user-defined-functions.py:counter" +--8<-- "python/user-guide/expressions/user-defined-functions.py:diff_from_mean_numba" ``` -### Combining multiple column values +## Missing data is not allowed when calling generalized ufuncs -If we want to have access to values of different columns in a single `map_elements` function call, we can create `struct` data -type. This data type collects those columns as fields in the `struct`. So if we'd create a struct from the columns -`"keys"` and `"values"`, we would get the following struct elements: +Before being passed to a user-defined function like `diff_from_mean_numba()`, a `Series` will be converted to a NumPy array. +Unfortunately, NumPy arrays don't have a concept of missing data. 
+If there is missing data in the original `Series`, this means the resulting array won't actually match the `Series`. -```python -[ - {"keys": "a", "values": 10}, - {"keys": "a", "values": 7}, - {"keys": "b", "values": 1}, -] -``` +If you're calculating results item by item, this doesn't matter. +For example, `numpy.log()` gets called on each individual value separately, so those missing values don't change the calculation. +But if the result of a user-defined function depends on multiple values in the `Series`, it's not clear what exactly should happen with the missing values. -In Python, those would be passed as `dict` to the calling Python function and can thus be indexed by `field: str`. In Rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast. +Therefore, when calling generalized ufuncs such as Numba functions decorated with `@guvectorize`, Polars will raise an error if you try to pass in a `Series` with missing data. +How do you get rid of missing data? +Either [fill it in](missing-data.md) or [drop it](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.drop_nulls.html) before calling your custom function. + +## Combining multiple column values + +If you want to pass multiple columns to a user-defined function, you can use `Struct`s, which are [covered in detail in a different section](structs.md). +The basic idea is to combine multiple columns into a `Struct`, and then the function can extract the columns back out: {{code_block('user-guide/expressions/user-defined-functions','combine',[])}} @@ -145,17 +113,22 @@ In Python, those would be passed as `dict` to the calling Python function and ca --8<-- "python/user-guide/expressions/user-defined-functions.py:combine" ``` -`Structs` are covered in detail in the next section. +## Streaming calculations -### Return types? 
+Passing the full `Series` to the user-defined function has a cost: it may use a lot of memory, as its contents are copied into a NumPy array. +You can use the `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. -Custom Python functions are black boxes for Polars. We really don't know what kind of black arts you are doing, so we have -to infer and try our best to understand what you meant. +!!! note +The `is_elementwise` argument can lead to incorrect results if set incorrectly. +If you set `is_elementwise=True`, make sure that your function actually operates +element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, +for instance, does not. -As a user it helps to understand what we do to better utilize custom functions. +## Return types -The data type is automatically inferred. We do that by waiting for the first non-null value. That value will then be used -to determine the type of the `Series`. +Custom Python functions are often black boxes; Polars doesn't know what your function is doing or what it will return. +The return data type is therefore automatically inferred. We do that by waiting for the first non-null value. That value will then be used +to determine the type of the resulting `Series`. The mapping of Python types to Polars data types is as follows: @@ -174,3 +147,5 @@ Rust types map as follows: - `bool` -> `Boolean` - `String` or `str` -> `String` - `Vec` -> `List[tp]` (where the inner type is inferred with the same rules) + +You can pass a `return_dtype` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) if you want to override the inferred type. 
diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index b4e4ac4ee63..49e661031b8 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -23,6 +23,7 @@ numba; python_version < '3.13' # Numba can lag Python releases pandas pyarrow pydantic>=2.0.0 +numba # Datetime / time zones backports.zoneinfo; python_version < '3.9' tzdata; platform_system == 'Windows' diff --git a/py-polars/tests/docs/test_user_guide.py b/py-polars/tests/docs/test_user_guide.py index 08be6fe9dfb..a513f4b5f0c 100644 --- a/py-polars/tests/docs/test_user_guide.py +++ b/py-polars/tests/docs/test_user_guide.py @@ -32,5 +32,8 @@ def _change_test_dir() -> Iterator[None]: @pytest.mark.docs() @pytest.mark.parametrize("path", snippet_paths) @pytest.mark.usefixtures("_change_test_dir") +@pytest.mark.filterwarnings( + r"ignore:\nExpr\.map_elements:polars.exceptions.PolarsInefficientMapWarning" +) def test_run_python_snippets(path: Path) -> None: runpy.run_path(str(path))