From 51bf2d0ae0c3d02b7f8b8cc657244a91d208a58f Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Fri, 20 Dec 2024 12:58:05 +0100 Subject: [PATCH] docs: null handling (#1624) --- docs/pandas_like_concepts/null_handling.md | 45 ++++++ mkdocs.yml | 1 + narwhals/dataframe.py | 74 +++++---- narwhals/expr.py | 153 +++++++++--------- narwhals/series.py | 175 +++++++++++++-------- 5 files changed, 281 insertions(+), 167 deletions(-) create mode 100644 docs/pandas_like_concepts/null_handling.md diff --git a/docs/pandas_like_concepts/null_handling.md b/docs/pandas_like_concepts/null_handling.md new file mode 100644 index 000000000..404b50304 --- /dev/null +++ b/docs/pandas_like_concepts/null_handling.md @@ -0,0 +1,45 @@ +# Null/NaN handling + +pandas doesn't distinguish between Null and NaN values as Polars and PyArrow do. + +Depending on the data type of the underlying data structure, `np.nan`, `pd.NaT`, `None` and `pd.NA` all encode missing data in pandas. + +Polars and PyArrow, instead, treat `NaN` as a valid floating point value which is rare to encounter and more often produced as the result of a computation than explicitly set during data initialization; they treat `null` as the missing data indicator, regardless of the data type. 
+ +In Narwhals, then, `is_null` behaves differently across backends (and so do `drop_nulls`, `fill_null` and `null_count`): + +```python exec="1" source="above" session="null_handling" +import narwhals as nw +import numpy as np +from narwhals.typing import IntoFrameT + +data = {"a": [1.4, float("nan"), np.nan, 4.2, None]} + + +def check_null_behavior(df: IntoFrameT) -> IntoFrameT: + return nw.from_native(df).with_columns(a_is_null=nw.col("a").is_null()).to_native() +``` + +=== "pandas" + ```python exec="true" source="material-block" result="python" session="null_handling" + import pandas as pd + + df = pd.DataFrame(data) + print(check_null_behavior(df)) + ``` + +=== "Polars (eager)" + ```python exec="true" source="material-block" result="python" session="null_handling" + import polars as pl + + df = pl.DataFrame(data) + print(check_null_behavior(df)) + ``` + +=== "PyArrow" + ```python exec="true" source="material-block" result="python" session="null_handling" + import pyarrow as pa + + df = pa.table(data) + print(check_null_behavior(df)) + ``` diff --git a/mkdocs.yml b/mkdocs.yml index 5bbbd12d8..4ec3b5710 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,6 +16,7 @@ nav: - pandas_like_concepts/user_warning.md - pandas_like_concepts/column_names.md - pandas_like_concepts/boolean.md + - pandas_like_concepts/null_handling.md - Overhead: overhead.md - Perfect backwards compatibility policy: backcompat.md - Supported libraries and extending Narwhals: extending.md diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 634f676f3..13c13ff14 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -1196,18 +1196,21 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: The original object with the rows removed that contained the null values. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. 
+ See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import polars as pl >>> import pandas as pd + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1.0, 2.0, None], "ba": [1.0, None, 2.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1215,7 +1218,7 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: ... df = nw.from_native(df_native) ... return df.drop_nulls().to_native() - We can then pass either pandas or Polars: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_drop_nulls`: >>> agnostic_drop_nulls(df_pd) a ba @@ -1229,6 +1232,13 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: ╞═════╪═════╡ │ 1.0 ┆ 1.0 │ └─────┴─────┘ + >>> agnostic_drop_nulls(df_pa) + pyarrow.Table + a: double + ba: double + ---- + a: [[1]] + ba: [[1]] """ return super().drop_nulls(subset=subset) @@ -2666,42 +2676,39 @@ def null_count(self: Self) -> Self: A dataframe of shape (1, n_columns). Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl - >>> df_pd = pd.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) - >>> df_pl = pl.DataFrame( - ... { - ... "foo": [1, None, 3], - ... "bar": [6, 7, None], - ... "ham": ["a", "b", "c"], - ... } - ... ) + >>> import pyarrow as pa + >>> data = { + ... "foo": [1, None, 3], + ... "bar": [6, 7, None], + ... "ham": ["a", "b", "c"], + ... 
} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function that returns the null count of each columns: - >>> @nw.narwhalify - ... def func(df): - ... return df.null_count() + >>> def agnostic_null_count(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.null_count().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_null_count`: - >>> func(df_pd) + >>> agnostic_null_count(df_pd) foo bar ham 0 1 1 0 - >>> func(df_pl) + >>> agnostic_null_count(df_pl) shape: (1, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -2710,6 +2717,16 @@ def null_count(self: Self) -> Self: ╞═════╪═════╪═════╡ │ 1 ┆ 1 ┆ 0 │ └─────┴─────┴─────┘ + + >>> agnostic_null_count(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: int64 + ---- + foo: [[1]] + bar: [[1]] + ham: [[0]] """ return self._from_compliant_dataframe(self._compliant_frame.null_count()) @@ -3309,8 +3326,9 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: (default), use all columns. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import polars as pl @@ -3328,7 +3346,7 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: ... df = nw.from_native(df_native) ... 
return df.drop_nulls().to_native() - We can then pass either pandas or Polars: + We can then pass any supported library such as Pandas or Polars to `agnostic_drop_nulls`: >>> agnostic_drop_nulls(df_pd) a ba diff --git a/narwhals/expr.py b/narwhals/expr.py index 24ddd3f40..013b79959 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1856,8 +1856,9 @@ def is_null(self) -> Self: A new expression. Notes: - pandas, Polars and PyArrow handle null values differently. Polars and PyArrow - distinguish between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import pandas as pd @@ -1869,23 +1870,21 @@ def is_null(self) -> Self: ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} ... ) >>> df_pl = pl.DataFrame( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} - ... ) - >>> df_pa = pa.table( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} + ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, None, 3.0, 5.0]} ... ) + >>> df_pa = pa.table({"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, None, 3.0, 5.0]}) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_null(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_is_null=nw.col("a").is_null(), b_is_null=nw.col("b").is_null() ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_is_null`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_null(df_pd) a b a_is_null b_is_null 0 2.0 2.0 False False 1 4.0 4.0 False False @@ -1893,21 +1892,21 @@ def is_null(self) -> Self: 3 3.0 3.0 False False 4 5.0 5.0 False False - >>> my_library_agnostic_function(df_pl) # nan != null for polars + >>> agnostic_is_null(df_pl) shape: (5, 4) - ┌──────┬─────┬───────────┬───────────┐ - │ a ┆ b ┆ a_is_null ┆ b_is_null │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ bool │ - ╞══════╪═════╪═══════════╪═══════════╡ - │ 2 ┆ 2.0 ┆ false ┆ false │ - │ 4 ┆ 4.0 ┆ false ┆ false │ - │ null ┆ NaN ┆ true ┆ false │ - │ 3 ┆ 3.0 ┆ false ┆ false │ - │ 5 ┆ 5.0 ┆ false ┆ false │ - └──────┴─────┴───────────┴───────────┘ - - >>> my_library_agnostic_function(df_pa) # nan != null for pyarrow + ┌──────┬──────┬───────────┬───────────┐ + │ a ┆ b ┆ a_is_null ┆ b_is_null │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ bool ┆ bool │ + ╞══════╪══════╪═══════════╪═══════════╡ + │ 2 ┆ 2.0 ┆ false ┆ false │ + │ 4 ┆ 4.0 ┆ false ┆ false │ + │ null ┆ null ┆ true ┆ true │ + │ 3 ┆ 3.0 ┆ false ┆ false │ + │ 5 ┆ 5.0 ┆ false ┆ false │ + └──────┴──────┴───────────┴───────────┘ + + >>> agnostic_is_null(df_pa) pyarrow.Table a: int64 b: double @@ -1915,9 +1914,9 @@ def is_null(self) -> Self: b_is_null: bool ---- a: [[2,4,null,3,5]] - b: [[2,4,nan,3,5]] + b: [[2,4,null,3,5]] a_is_null: [[false,false,true,false,false]] - b_is_null: [[false,false,false,false,false]] + b_is_null: [[false,false,true,false,false]] """ return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null()) @@ -1985,8 +1984,9 @@ def fill_null( A new expression. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. 
+ pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import pandas as pd @@ -2003,25 +2003,25 @@ def fill_null( >>> df_pl = pl.DataFrame( ... { ... "a": [2, 4, None, None, 3, 5], - ... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0], + ... "b": [2.0, 4.0, None, None, 3.0, 5.0], ... } ... ) >>> df_pa = pa.table( ... { ... "a": [2, 4, None, None, 3, 5], - ... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0], + ... "b": [2.0, 4.0, None, None, 3.0, 5.0], ... } ... ) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_fill_null(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(nw.col("a", "b").fill_null(0)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_fill_null`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_fill_null(df_pd) a b 0 2.0 2.0 1 4.0 4.0 @@ -2030,7 +2030,7 @@ def fill_null( 4 3.0 3.0 5 5.0 5.0 - >>> my_library_agnostic_function(df_pl) # nan != null for polars + >>> agnostic_fill_null(df_pl) shape: (6, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2039,23 +2039,23 @@ def fill_null( ╞═════╪═════╡ │ 2 ┆ 2.0 │ │ 4 ┆ 4.0 │ - │ 0 ┆ NaN │ - │ 0 ┆ NaN │ + │ 0 ┆ 0.0 │ + │ 0 ┆ 0.0 │ │ 3 ┆ 3.0 │ │ 5 ┆ 5.0 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) # nan != null for pyarrow + >>> agnostic_fill_null(df_pa) pyarrow.Table a: int64 b: double ---- a: [[2,4,0,0,3,5]] - b: [[2,4,nan,nan,3,5]] + b: [[2,4,0,0,3,5]] Using a strategy: - >>> def func_strategies(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_fill_null_with_strategy(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... 
nw.col("a", "b") @@ -2063,7 +2063,7 @@ def fill_null( ... .name.suffix("_filled") ... ).to_native() - >>> func_strategies(df_pd) + >>> agnostic_fill_null_with_strategy(df_pd) a b a_filled b_filled 0 2.0 2.0 2.0 2.0 1 4.0 4.0 4.0 4.0 @@ -2072,22 +2072,22 @@ def fill_null( 4 3.0 3.0 3.0 3.0 5 5.0 5.0 5.0 5.0 - >>> func_strategies(df_pl) # nan != null for polars + >>> agnostic_fill_null_with_strategy(df_pl) shape: (6, 4) - ┌──────┬─────┬──────────┬──────────┐ - │ a ┆ b ┆ a_filled ┆ b_filled │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ i64 ┆ f64 │ - ╞══════╪═════╪══════════╪══════════╡ - │ 2 ┆ 2.0 ┆ 2 ┆ 2.0 │ - │ 4 ┆ 4.0 ┆ 4 ┆ 4.0 │ - │ null ┆ NaN ┆ 4 ┆ NaN │ - │ null ┆ NaN ┆ null ┆ NaN │ - │ 3 ┆ 3.0 ┆ 3 ┆ 3.0 │ - │ 5 ┆ 5.0 ┆ 5 ┆ 5.0 │ - └──────┴─────┴──────────┴──────────┘ - - >>> func_strategies(df_pa) # nan != null for pyarrow + ┌──────┬──────┬──────────┬──────────┐ + │ a ┆ b ┆ a_filled ┆ b_filled │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ i64 ┆ f64 │ + ╞══════╪══════╪══════════╪══════════╡ + │ 2 ┆ 2.0 ┆ 2 ┆ 2.0 │ + │ 4 ┆ 4.0 ┆ 4 ┆ 4.0 │ + │ null ┆ null ┆ 4 ┆ 4.0 │ + │ null ┆ null ┆ null ┆ null │ + │ 3 ┆ 3.0 ┆ 3 ┆ 3.0 │ + │ 5 ┆ 5.0 ┆ 5 ┆ 5.0 │ + └──────┴──────┴──────────┴──────────┘ + + >>> agnostic_fill_null_with_strategy(df_pa) pyarrow.Table a: int64 b: double @@ -2095,9 +2095,9 @@ def fill_null( b_filled: double ---- a: [[2,4,null,null,3,5]] - b: [[2,4,nan,nan,3,5]] + b: [[2,4,null,null,3,5]] a_filled: [[2,4,4,null,3,5]] - b_filled: [[2,4,nan,nan,3,5]] + b_filled: [[2,4,4,null,3,5]] """ if value is not None and strategy is not None: msg = "cannot specify both `value` and `strategy`" @@ -2116,14 +2116,15 @@ def fill_null( # --- partial reduction --- def drop_nulls(self) -> Self: - """Remove missing values. + """Drop null values. Returns: A new expression. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. 
+ See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import narwhals as nw @@ -2133,25 +2134,25 @@ def drop_nulls(self) -> Self: >>> import pyarrow as pa >>> df_pd = pd.DataFrame({"a": [2.0, 4.0, float("nan"), 3.0, None, 5.0]}) - >>> df_pl = pl.DataFrame({"a": [2.0, 4.0, float("nan"), 3.0, None, 5.0]}) - >>> df_pa = pa.table({"a": [2.0, 4.0, float("nan"), 3.0, None, 5.0]}) + >>> df_pl = pl.DataFrame({"a": [2.0, 4.0, None, 3.0, None, 5.0]}) + >>> df_pa = pa.table({"a": [2.0, 4.0, None, 3.0, None, 5.0]}) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_drop_nulls(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").drop_nulls()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_drop_nulls`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_drop_nulls(df_pd) a 0 2.0 1 4.0 3 3.0 5 5.0 - >>> my_library_agnostic_function(df_pl) # nan != null for polars - shape: (5, 1) + >>> agnostic_drop_nulls(df_pl) + shape: (4, 1) ┌─────┐ │ a │ │ --- │ @@ -2159,15 +2160,14 @@ def drop_nulls(self) -> Self: ╞═════╡ │ 2.0 │ │ 4.0 │ - │ NaN │ │ 3.0 │ │ 5.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) # nan != null for pyarrow + >>> agnostic_drop_nulls(df_pa) pyarrow.Table a: double ---- - a: [[2,4,nan,3,5]] + a: [[2,4,3,5]] """ return self.__class__(lambda plx: self._to_compliant_expr(plx).drop_nulls()) @@ -2438,8 +2438,9 @@ def null_count(self) -> Self: A new expression. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. 
Examples: >>> import narwhals as nw @@ -2454,16 +2455,16 @@ def null_count(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_null_count(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.all().null_count()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_null_count`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_null_count(df_pd) a b 0 1 2 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_null_count(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2472,7 +2473,7 @@ def null_count(self) -> Self: ╞═════╪═════╡ │ 1 ┆ 2 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_null_count(df_pa) pyarrow.Table a: int64 b: int64 diff --git a/narwhals/series.py b/narwhals/series.py index 98baab296..71cc8062e 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1278,44 +1278,56 @@ def arg_true(self) -> Self: return self._from_compliant_series(self._compliant_series.arg_true()) def drop_nulls(self) -> Self: - """Drop all null values. + """Drop null values. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. 
Examples: - >>> import pandas as pd - >>> import polars as pl - >>> import numpy as np - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT - >>> s_pd = pd.Series([2, 4, None, 3, 5]) - >>> s_pl = pl.Series("a", [2, 4, None, 3, 5]) - - Now define a dataframe-agnostic function with a `column` argument for the column to evaluate : - - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: - ... s = nw.from_native(s_native, series_only=True) - ... return s.drop_nulls().to_native() - - Then we can pass either Series (polars or pandas) to `func`: - - >>> my_library_agnostic_function(s_pd) - 0 2.0 - 1 4.0 - 3 3.0 - 4 5.0 - dtype: float64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (4,) - Series: 'a' [i64] - [ - 2 - 4 - 3 - 5 - ] + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> s_pd = pd.Series([2, 4, None, 3, 5]) + >>> s_pl = pl.Series([2, 4, None, 3, 5]) + >>> s_pa = pa.chunked_array([[2, 4, None, 3, 5]]) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_drop_nulls(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.drop_nulls().to_native() + + We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_drop_nulls`: + + >>> agnostic_drop_nulls(s_pd) + 0 2.0 + 1 4.0 + 3 3.0 + 4 5.0 + dtype: float64 + >>> agnostic_drop_nulls(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [i64] + [ + 2 + 4 + 3 + 5 + ] + >>> agnostic_drop_nulls(s_pa) # doctest: +ELLIPSIS + <pyarrow.lib.ChunkedArray object at ...> + [ + [ + 2, + 4, + 3, + 5 + ] + ] """ return self._from_compliant_series(self._compliant_series.drop_nulls()) @@ -1879,32 +1891,35 @@ def is_null(self) -> Self: """Returns a boolean Series indicating which values are null. Notes: - pandas and Polars handle null values differently. 
Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import pandas as pd >>> import polars as pl >>> import narwhals as nw + >>> import pyarrow as pa >>> from narwhals.typing import IntoSeriesT >>> s = [1, 2, None] >>> s_pd = pd.Series(s) >>> s_pl = pl.Series(s) + >>> s_pa = pa.chunked_array([s]) - We define a dataframe-agnostic function: + Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_null(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_null().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_is_null`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_is_null(s_pd) 0 False 1 False 2 True dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_null(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] [ @@ -1912,6 +1927,15 @@ def is_null(self) -> Self: false true ] + >>> agnostic_is_null(s_pa) # doctest: +ELLIPSIS + <pyarrow.lib.ChunkedArray object at ...> + [ + [ + false, + false, + true + ] + ] """ return self._from_compliant_series(self._compliant_series.is_null()) @@ -1925,38 +1949,39 @@ def fill_null( Arguments: value: Value used to fill null values. - strategy: Strategy used to fill null values. - limit: Number of consecutive null values to fill when using the 'forward' or 'backward' strategy. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. 
Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT >>> s = [1, 2, None] >>> s_pd = pd.Series(s) >>> s_pl = pl.Series(s) + >>> s_pa = pa.chunked_array([s]) - We define a dataframe-agnostic function: + Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_fill_null(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.fill_null(5).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_fill_null`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_fill_null(s_pd) 0 1.0 1 2.0 2 5.0 dtype: float64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_fill_null(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -1964,20 +1989,28 @@ def fill_null( 2 5 ] + >>> agnostic_fill_null(s_pa) # doctest: +ELLIPSIS + <pyarrow.lib.ChunkedArray object at ...> + [ + [ + 1, + 2, + 5 + ] + ] Using a strategy: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_fill_null_with_strategy(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.fill_null(strategy="forward", limit=1).to_native() - >>> my_library_agnostic_function(s_pd) + >>> agnostic_fill_null_with_strategy(s_pd) 0 1.0 1 2.0 2 2.0 dtype: float64 - - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_fill_null_with_strategy(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -1985,6 +2018,15 @@ def fill_null( 2 2 ] + >>> agnostic_fill_null_with_strategy(s_pa) # doctest: +ELLIPSIS + <pyarrow.lib.ChunkedArray object at ...> + [ + [ + 1, + 2, + 2 + ] + ] """ if value is not None and strategy is not None: msg = "cannot specify both `value` and `strategy`" @@ -2416,28 +2458,35 @@ def null_count(self: Self) -> int: r"""Create a new Series that shows the null counts per column. Notes: - pandas and Polars handle null values differently. Polars distinguishes - between NaN and Null, whereas pandas doesn't. + pandas handles null values differently from Polars and PyArrow. + See [null_handling](../../pandas_like_concepts/null_handling) + for reference. Examples: >>> import narwhals as nw >>> from narwhals.typing import IntoSeries >>> import pandas as pd >>> import polars as pl - >>> s_pd = pd.Series([1, None, 3]) - >>> s_pl = pl.Series([1, None, None]) + >>> import pyarrow as pa + >>> s = [1, None, None] + >>> s_pd = pd.Series(s) + >>> s_pl = pl.Series(s) + >>> s_pa = pa.chunked_array([s]) Let's define a dataframe-agnostic function that returns the null count of the series: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_null_count(s_native: IntoSeries): ... s = nw.from_native(s_native, series_only=True) ... 
return s.null_count() - We can then pass either pandas or Polars to `func`: - >>> my_library_agnostic_function(s_pd) - np.int64(1) - >>> my_library_agnostic_function(s_pl) + We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_null_count`: + + >>> agnostic_null_count(s_pd) + np.int64(2) + >>> agnostic_null_count(s_pl) + 2 + >>> agnostic_null_count(s_pa) 2 """ return self._compliant_series.null_count() # type: ignore[no-any-return]