
docs: null handling (#1624)
AlessandroMiola authored Dec 20, 2024
1 parent 453d6a6 commit 51bf2d0
Showing 5 changed files with 281 additions and 167 deletions.
45 changes: 45 additions & 0 deletions docs/pandas_like_concepts/null_handling.md
@@ -0,0 +1,45 @@
# Null/NaN handling

pandas doesn't distinguish between null and NaN values, whereas Polars and PyArrow do.

Depending on the data type of the underlying data structure, `np.nan`, `pd.NaT`, `None` and `pd.NA` all encode missing data in pandas.

Polars and PyArrow, instead, treat `NaN` as a valid floating point value: it is rare to encounter, and it is more often produced as the result of a computation than explicitly set during data initialization. They treat `null` as the missing data indicator, regardless of the data type.

In Narwhals, then, `is_null` behaves differently across backends (and so do `drop_nulls`, `fill_null` and `null_count`):

```python exec="1" source="above" session="null_handling"
import narwhals as nw
import numpy as np
from narwhals.typing import IntoFrameT

data = {"a": [1.4, float("nan"), np.nan, 4.2, None]}


def check_null_behavior(df: IntoFrameT) -> IntoFrameT:
return nw.from_native(df).with_columns(a_is_null=nw.col("a").is_null()).to_native()
```

=== "pandas"
```python exec="true" source="material-block" result="python" session="null_handling"
import pandas as pd

df = pd.DataFrame(data)
print(check_null_behavior(df))
```

=== "Polars (eager)"
```python exec="true" source="material-block" result="python" session="null_handling"
import polars as pl

df = pl.DataFrame(data)
print(check_null_behavior(df))
```

=== "PyArrow"
```python exec="true" source="material-block" result="python" session="null_handling"
import pyarrow as pa

df = pa.table(data)
print(check_null_behavior(df))
```
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -16,6 +16,7 @@ nav:
- pandas_like_concepts/user_warning.md
- pandas_like_concepts/column_names.md
- pandas_like_concepts/boolean.md
- pandas_like_concepts/null_handling.md
- Overhead: overhead.md
- Perfect backwards compatibility policy: backcompat.md
- Supported libraries and extending Narwhals: extending.md
74 changes: 46 additions & 28 deletions narwhals/dataframe.py
@@ -1196,26 +1196,29 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self:
The original object with the rows removed that contained the null values.
Notes:
pandas and Polars handle null values differently. Polars distinguishes
between NaN and Null, whereas pandas doesn't.
pandas handles null values differently from Polars and PyArrow.
See [null_handling](../../pandas_like_concepts/null_handling)
for reference.
Examples:
>>> import polars as pl
>>> import pandas as pd
>>> import pyarrow as pa
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>>
>>> data = {"a": [1.0, 2.0, None], "ba": [1.0, None, 2.0]}
>>> df_pd = pd.DataFrame(data)
>>> df_pl = pl.DataFrame(data)
>>> df_pa = pa.table(data)
Let's define a dataframe-agnostic function:
>>> def agnostic_drop_nulls(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.drop_nulls().to_native()
We can then pass either pandas or Polars:
We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_drop_nulls`:
>>> agnostic_drop_nulls(df_pd)
a ba
@@ -1229,6 +1232,13 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self:
╞═════╪═════╡
│ 1.0 ┆ 1.0 │
└─────┴─────┘
>>> agnostic_drop_nulls(df_pa)
pyarrow.Table
a: double
ba: double
----
a: [[1]]
ba: [[1]]
"""
return super().drop_nulls(subset=subset)

@@ -2666,42 +2676,39 @@ def null_count(self: Self) -> Self:
A dataframe of shape (1, n_columns).
Notes:
pandas and Polars handle null values differently. Polars distinguishes
between NaN and Null, whereas pandas doesn't.
pandas handles null values differently from Polars and PyArrow.
See [null_handling](../../pandas_like_concepts/null_handling)
for reference.
Examples:
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrameT
>>> import pandas as pd
>>> import polars as pl
>>> df_pd = pd.DataFrame(
... {
... "foo": [1, None, 3],
... "bar": [6, 7, None],
... "ham": ["a", "b", "c"],
... }
... )
>>> df_pl = pl.DataFrame(
... {
... "foo": [1, None, 3],
... "bar": [6, 7, None],
... "ham": ["a", "b", "c"],
... }
... )
>>> import pyarrow as pa
>>> data = {
... "foo": [1, None, 3],
... "bar": [6, 7, None],
... "ham": ["a", "b", "c"],
... }
>>> df_pd = pd.DataFrame(data)
>>> df_pl = pl.DataFrame(data)
>>> df_pa = pa.table(data)
Let's define a dataframe-agnostic function that returns the null count of
each column:
>>> @nw.narwhalify
... def func(df):
... return df.null_count()
>>> def agnostic_null_count(df_native: IntoFrameT) -> IntoFrameT:
... df = nw.from_native(df_native)
... return df.null_count().to_native()
We can then pass either pandas or Polars to `func`:
We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_null_count`:
>>> func(df_pd)
>>> agnostic_null_count(df_pd)
foo bar ham
0 1 1 0
>>> func(df_pl)
>>> agnostic_null_count(df_pl)
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
@@ -2710,6 +2717,16 @@ def null_count(self: Self) -> Self:
╞═════╪═════╪═════╡
│ 1 ┆ 1 ┆ 0 │
└─────┴─────┴─────┘
>>> agnostic_null_count(df_pa)
pyarrow.Table
foo: int64
bar: int64
ham: int64
----
foo: [[1]]
bar: [[1]]
ham: [[0]]
"""
return self._from_compliant_dataframe(self._compliant_frame.null_count())

@@ -3309,8 +3326,9 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self:
(default), use all columns.
Notes:
pandas and Polars handle null values differently. Polars distinguishes
between NaN and Null, whereas pandas doesn't.
pandas handles null values differently from Polars and PyArrow.
See [null_handling](../../pandas_like_concepts/null_handling)
for reference.
Examples:
>>> import polars as pl
@@ -3328,7 +3346,7 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self:
... df = nw.from_native(df_native)
... return df.drop_nulls().to_native()
We can then pass either pandas or Polars:
We can then pass any supported library such as pandas or Polars to `agnostic_drop_nulls`:
>>> agnostic_drop_nulls(df_pd)
a ba