From a06885362ce8d9b89441f4565a32d95ea6eaf100 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 17:53:59 +0200 Subject: [PATCH 1/7] add make_series and make_dataframe --- skrub/dataframe/_pandas.py | 31 +++++++++++++++++ skrub/dataframe/_polars.py | 50 ++++++++++++++++++++++++++++ skrub/dataframe/tests/test_pandas.py | 20 +++++++++-- skrub/dataframe/tests/test_polars.py | 21 ++++++++++-- 4 files changed, 118 insertions(+), 4 deletions(-) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py index f6aae26cd..69f9c306d 100644 --- a/skrub/dataframe/_pandas.py +++ b/skrub/dataframe/_pandas.py @@ -12,6 +12,37 @@ from skrub._utils import atleast_1d_or_none +def make_dataframe(X, index): + if not isinstance(X, dict) or not all( + (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) + for X_col in X.values() + ): + raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") + return pd.DataFrame(X, index=index) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Pandas series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the series. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Pandas series + Converted output. + """ + return pd.Series(X, index=index, name=name) + + def aggregate( table: pd.DataFrame, key: str | Iterable[str], diff --git a/skrub/dataframe/_polars.py b/skrub/dataframe/_polars.py index a10546b7e..427aca15c 100644 --- a/skrub/dataframe/_polars.py +++ b/skrub/dataframe/_polars.py @@ -3,6 +3,8 @@ """ from typing import Iterable +import numpy as np + from skrub.dataframe._types import POLARS_SETUP, DataFrameLike if POLARS_SETUP: @@ -14,6 +16,54 @@ from skrub._utils import atleast_1d_or_none +def make_dataframe(X, index=None): + """Convert an dictionary of columns into a Polars dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + Returns + ------- + X : Polars dataframe + Converted output. + """ + if not isinstance(X, dict) or not all( + (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) + for X_col in X.values() + ): + raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") + return pl.DataFrame(X) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Polars series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Polars series + Converted output. 
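+
+    Examples
+    --------
+    A minimal usage sketch; the repr shown assumes a recent Polars version
+    and may vary slightly across releases:
+
+    >>> make_series([1, 2, 3], name="a")  # doctest: +SKIP
+    shape: (3,)
+    Series: 'a' [i64]
+    [
+        1
+        2
+        3
+    ]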
+ """ + return pl.Series(values=X, name=name) + + def aggregate( table: DataFrameLike, key: str | Iterable[str], diff --git a/skrub/dataframe/tests/test_pandas.py b/skrub/dataframe/tests/test_pandas.py index 067788540..1453dd230 100644 --- a/skrub/dataframe/tests/test_pandas.py +++ b/skrub/dataframe/tests/test_pandas.py @@ -1,8 +1,8 @@ import pandas as pd import pytest -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal -from skrub.dataframe._pandas import aggregate, join +from skrub.dataframe._pandas import aggregate, join, make_dataframe, make_series main = pd.DataFrame( { @@ -95,3 +95,19 @@ def test_no_agg_operation(): num_operations=None, categ_operations=None, ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pd.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X, index=[0, 1]), expected_df) + + X = [[1, 2], ["z", "e"]] + with pytest.raises(TypeError): + make_dataframe(X) + + +def test_make_series(): + X = [1, 2, 3] + expected_series = pd.Series(X) + assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) diff --git a/skrub/dataframe/tests/test_polars.py b/skrub/dataframe/tests/test_polars.py index 49ccee1d0..f982f72f0 100644 --- a/skrub/dataframe/tests/test_polars.py +++ b/skrub/dataframe/tests/test_polars.py @@ -2,11 +2,11 @@ import pytest from skrub.dataframe import POLARS_SETUP -from skrub.dataframe._polars import aggregate, join +from skrub.dataframe._polars import aggregate, join, make_dataframe, make_series if POLARS_SETUP: import polars as pl - from polars.testing import assert_frame_equal + from polars.testing import assert_frame_equal, assert_series_equal main = pl.DataFrame( { @@ -71,3 +71,20 @@ def test_incorrect_dataframe_inputs(): cols_to_agg="rating", num_operations="mean", ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X, index=[1, 2]), expected_df) + + X = [[1, 2], ["z", "e"]] + with pytest.raises(TypeError): + make_dataframe(X) + + +@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) +def test_make_series(): + X = [1, 2, 3] + expected_series = pl.Series(X) + assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) From 7e1424da39d79fda91ee2dea30249fc379b27e92 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 17:59:51 +0200 Subject: [PATCH 2/7] forgot make_dataframe for pandas docstring --- skrub/dataframe/_pandas.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py index 69f9c306d..d549e8104 100644 --- a/skrub/dataframe/_pandas.py +++ b/skrub/dataframe/_pandas.py @@ -13,6 +13,21 @@ def make_dataframe(X, index): + """Convert an dictionary of columns into a Pandas dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the dataframe. + + Returns + ------- + X : Pandas dataframe + Converted output. 
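+
+    Examples
+    --------
+    A small sketch reusing the toy input from the tests:
+
+    >>> make_dataframe({"a": [1, 2], "b": ["z", "e"]}, index=[0, 1])
+       a  b
+    0  1  z
+    1  2  e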
+ """ if not isinstance(X, dict) or not all( (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) for X_col in X.values() From e5e00d0b9ee67752ef489777f243c1ffe8ee0741 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 16:43:41 +0100 Subject: [PATCH 3/7] apply suggestions --- skrub/dataframe/_pandas.py | 2 +- skrub/dataframe/_polars.py | 10 ++++++++++ skrub/dataframe/tests/test_polars.py | 20 +++++++++++--------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py index d549e8104..b3be557ec 100644 --- a/skrub/dataframe/_pandas.py +++ b/skrub/dataframe/_pandas.py @@ -12,7 +12,7 @@ from skrub._utils import atleast_1d_or_none -def make_dataframe(X, index): +def make_dataframe(X, index=None): """Convert an dictionary of columns into a Pandas dataframe. Parameters diff --git a/skrub/dataframe/_polars.py b/skrub/dataframe/_polars.py index 427aca15c..bf4e30f7f 100644 --- a/skrub/dataframe/_polars.py +++ b/skrub/dataframe/_polars.py @@ -33,6 +33,11 @@ def make_dataframe(X, index=None): X : Polars dataframe Converted output. """ + if index is not None: + raise ValueError( + "Polars dataframes don't have an index, but " + f"the Polars dataframe maker was called with {index=!r}." + ) if not isinstance(X, dict) or not all( (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) for X_col in X.values() @@ -61,6 +66,11 @@ def make_series(X, index=None, name=None): X : Polars series Converted output. """ + if index is not None: + raise ValueError( + "Polars series don't have an index, but " + f"the Polars series maker was called with {index=!r}." + ) return pl.Series(values=X, name=name) diff --git a/skrub/dataframe/tests/test_polars.py b/skrub/dataframe/tests/test_polars.py index f982f72f0..c9f8bc4bd 100644 --- a/skrub/dataframe/tests/test_polars.py +++ b/skrub/dataframe/tests/test_polars.py @@ -16,18 +16,17 @@ "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], } ) +else: + polars_missing_msg = "Polars is not available" + pytest.skip(reason=polars_missing_msg, allow_module_level=True) -POLARS_MISSING_MSG = "Polars is not available" - -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_join(): joined = join(left=main, right=main, left_on="movieId", right_on="movieId") expected = main.join(main, on="movieId", how="left") assert_frame_equal(joined, expected) -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_simple_agg(): aggregated = aggregate( table=main, @@ -42,7 +41,6 @@ def test_simple_agg(): assert_frame_equal(aggregated, expected, check_row_order=False) -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_mode_agg(): aggregated = aggregate( table=main, @@ -59,7 +57,6 @@ def test_mode_agg(): assert_frame_equal(aggregated, expected, check_row_order=False) -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_incorrect_dataframe_inputs(): with pytest.raises(TypeError, match=r"(?=.*polars dataframes)(?=.*pandas)"): join(left=pd.DataFrame(main), right=main, left_on="movieId", right_on="movieId") @@ -76,15 +73,20 @@ def test_incorrect_dataframe_inputs(): def test_make_dataframe(): X = dict(a=[1, 2], b=["z", "e"]) expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) - assert_frame_equal(make_dataframe(X, index=[1, 2]), expected_df) + assert_frame_equal(make_dataframe(X), expected_df) X = [[1, 2], ["z", "e"]] with pytest.raises(TypeError): make_dataframe(X) + with pytest.raises(ValueError, match=r"(?=.*Polars 
dataframe)(?=.*index)"): + make_dataframe(X, index=[0, 1]) + -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_make_series(): X = [1, 2, 3] expected_series = pl.Series(X) - assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) + assert_series_equal(make_series(X, index=None), expected_series) + + with pytest.raises(ValueError, match=r"(?=.*Polars series)(?=.*index)"): + make_series(X, index=[0, 1]) From 386ba6a7b807a05ca32b68f6c480ee28a8fb011d Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 18:00:40 +0100 Subject: [PATCH 4/7] from dataframe to _dataframe --- doc/api.rst | 40 --- skrub/_agg_joiner.py | 56 ++-- skrub/dataframe/__init__.py | 19 -- skrub/dataframe/_namespace.py | 103 ------- skrub/dataframe/_pandas.py | 339 ------------------------ skrub/dataframe/_polars.py | 278 ------------------- skrub/dataframe/_types.py | 14 - skrub/dataframe/tests/__init__.py | 0 skrub/dataframe/tests/test_namespace.py | 41 --- skrub/dataframe/tests/test_pandas.py | 113 -------- skrub/dataframe/tests/test_polars.py | 92 ------- skrub/tests/test_agg_joiner.py | 2 +- 12 files changed, 20 insertions(+), 1077 deletions(-) delete mode 100644 skrub/dataframe/__init__.py delete mode 100644 skrub/dataframe/_namespace.py delete mode 100644 skrub/dataframe/_pandas.py delete mode 100644 skrub/dataframe/_polars.py delete mode 100644 skrub/dataframe/_types.py delete mode 100644 skrub/dataframe/tests/__init__.py delete mode 100644 skrub/dataframe/tests/test_namespace.py delete mode 100644 skrub/dataframe/tests/test_pandas.py delete mode 100644 skrub/dataframe/tests/test_polars.py diff --git a/doc/api.rst b/doc/api.rst index 3604abc9b..df1a0a393 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -87,46 +87,6 @@ This page lists all available functions and classes of `skrub`. deduplicate -.. raw:: html - -
-   <h2>Dataframes operations</h2>
- -.. autosummary:: - :toctree: generated/ - :template: function.rst - :nosignatures: - :caption: DataFrames operations - - dataframe.get_df_namespace - -.. raw:: html - -
-   <h3>Pandas</h3>
- -.. autosummary:: - :toctree: generated/ - :template: function.rst - :nosignatures: - :caption: Pandas operations - - dataframe.is_pandas - dataframe.pd_aggregate - dataframe.pd_join - -.. raw:: html - -
-   <h3>Polars</h3>
- -.. autosummary:: - :toctree: generated/ - :template: function.rst - :nosignatures: - :caption: Polars operations - - dataframe.is_polars - dataframe.pl_aggregate - dataframe.pl_join - .. raw:: html
    <h2>Data download and generation</h2>
diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py index 7e3e3fb2b..effc833dc 100644 --- a/skrub/_agg_joiner.py +++ b/skrub/_agg_joiner.py @@ -9,15 +9,13 @@ from typing import Iterable import numpy as np -from numpy.typing import ArrayLike from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import check_is_fitted +from skrub._dataframe._namespace import get_df_namespace +from skrub._dataframe._pandas import _parse_argument from skrub._utils import atleast_1d_or_none, atleast_2d_or_none -from skrub.dataframe import DataFrameLike, SeriesLike -from skrub.dataframe._namespace import get_df_namespace -from skrub.dataframe._pandas import _parse_argument NUM_OPERATIONS = ["sum", "mean", "std", "min", "max", "hist", "value_counts"] CATEG_OPERATIONS = ["mode", "count", "value_counts"] @@ -52,10 +50,10 @@ def split_num_categ_operations(operations: list[str]) -> tuple[list[str], list[s def check_missing_columns( - X: DataFrameLike, - columns: list[str], - error_msg: str, -) -> None: + X, + columns, + error_msg, +): """All elements of main_key must belong to the columns of X. Parameters @@ -161,13 +159,13 @@ class AggJoiner(BaseEstimator, TransformerMixin): def __init__( self, - aux_table: DataFrameLike | Iterable[DataFrameLike] | str | Iterable[str], + aux_table, *, - aux_key: str | Iterable[str], - main_key: str | Iterable[str], - cols: str | Iterable[str] | None = None, - operation: str | Iterable[str] | None = None, - suffix: str | Iterable[str] | None = None, + aux_key, + main_key, + cols=None, + operation=None, + suffix=None, ): self.aux_table = aux_table self.aux_key = aux_key @@ -176,11 +174,7 @@ def __init__( self.operation = operation self.suffix = suffix - def fit( - self, - X: DataFrameLike, - y: ArrayLike | SeriesLike | None = None, - ) -> "AggJoiner": + def fit(self, X, y=None): """Aggregate auxiliary tables based on the main keys. Parameters @@ -221,7 +215,7 @@ def fit( return self - def transform(self, X: DataFrameLike) -> DataFrameLike: + def transform(self, X): """Left-join pre-aggregated tables on `X`. Parameters @@ -248,18 +242,14 @@ def transform(self, X: DataFrameLike) -> DataFrameLike: return X - def _screen( - self, - aux_table: DataFrameLike, - y: DataFrameLike | SeriesLike | ArrayLike, - ) -> DataFrameLike: + def _screen(self, aux_table, y): """Only keep aggregated features which correlation with y is above some threshold. """ # TODO: Add logic return aux_table - def check_input(self, X: DataFrameLike) -> None: + def check_input(self, X): """Perform a check on column names data type and suffixes. Parameters @@ -452,11 +442,7 @@ def __init__( self.operation = operation self.suffix = suffix - def fit( - self, - X: DataFrameLike, - y: DataFrameLike | SeriesLike | ArrayLike, - ) -> "AggTarget": + def fit(self, X, y): """Aggregate the target ``y`` based on keys from ``X``. Parameters @@ -501,7 +487,7 @@ def fit( return self - def transform(self, X: DataFrameLike) -> DataFrameLike: + def transform(self, X): """Left-join pre-aggregated tables on `X`. Parameters @@ -524,11 +510,7 @@ def transform(self, X: DataFrameLike) -> DataFrameLike: right_on=self.main_key_, ) - def check_input( - self, - X: DataFrameLike, - y: DataFrameLike | SeriesLike | ArrayLike, - ) -> DataFrameLike: + def check_input(self, X, y): """Perform a check on column names data type and suffixes. 
Parameters diff --git a/skrub/dataframe/__init__.py b/skrub/dataframe/__init__.py deleted file mode 100644 index a89e910e6..000000000 --- a/skrub/dataframe/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from skrub.dataframe._namespace import get_df_namespace, is_pandas, is_polars -from skrub.dataframe._pandas import aggregate as pd_aggregate -from skrub.dataframe._pandas import join as pd_join -from skrub.dataframe._polars import aggregate as pl_aggregate -from skrub.dataframe._polars import join as pl_join -from skrub.dataframe._types import POLARS_SETUP, DataFrameLike, SeriesLike - -__all__ = [ - POLARS_SETUP, - DataFrameLike, - SeriesLike, - get_df_namespace, - is_pandas, - is_polars, - pd_join, - pd_aggregate, - pl_join, - pl_aggregate, -] diff --git a/skrub/dataframe/_namespace.py b/skrub/dataframe/_namespace.py deleted file mode 100644 index ebd6516b1..000000000 --- a/skrub/dataframe/_namespace.py +++ /dev/null @@ -1,103 +0,0 @@ -import sys -from types import ModuleType - -import pandas as pd - -import skrub.dataframe._pandas as skrub_pd -import skrub.dataframe._polars as skrub_pl -from skrub.dataframe._types import DataFrameLike - - -def is_pandas(dataframe: DataFrameLike) -> bool: - """Check whether the input is a Pandas dataframe. - - Parameters - ---------- - dataframe : DataFrameLike - The input dataframe - - Returns - ------- - is_pandas : bool - Whether the dataframe is a Pandas dataframe or not. - """ - return isinstance(dataframe, pd.DataFrame) - - -def is_polars(dataframe: DataFrameLike) -> bool: - """Check whether the input is a Polars dataframe or lazyframe. - - Parameters - ---------- - dataframe : DataFrameLike - The input dataframe - - Returns - ------- - is_polars : bool - Whether the dataframe is a Polars dataframe/lazyframe or not. - """ - if "polars" not in sys.modules: - return False - - import polars as pl - - return isinstance(dataframe, (pl.DataFrame, pl.LazyFrame)) - - -def get_df_namespace( - *dfs: DataFrameLike | list[DataFrameLike], -) -> tuple[ModuleType, ModuleType]: - """Get the namespaces of dataframes. - - Introspects dataframes and returns their skrub namespace object - ``skrub.dataframe._{pandas, polars}`` and the dataframe module - ``{polars, pandas}`` itself. - - The dataframes passed in input need to come from the same module, otherwise a - ``TypeError`` will be raised. - - The outputs of this function are denoted ``skrub_px`` and ``px`` in reference to - the array API, returning namespace (NumPy, PyTorch and CuPy) as ``nx``. - Since we deal with Polars (``pl``) and Pandas (``pd``), we use ``px`` - as a variable name. - - Parameters - ---------- - dfs : DataFrameLike | list[DataFrameLike], - The dataframes to extract modules from. - - Returns - ------- - skrub_px : ModuleType - Skrub namespace shared by dataframe objects. - - px : ModuleType - Dataframe namespace, i.e. Pandas or Polars module. - """ - # FIXME Pandas and Polars series will raise errors. - if all([is_pandas(df) for df in dfs]): - return skrub_pd, pd - - elif all([is_polars(df) for df in dfs]): - import polars as pl - - if all([isinstance(df, pl.DataFrame) for df in dfs]) or all( - [isinstance(df, pl.LazyFrame) for df in dfs] - ): - return skrub_pl, pl - else: - raise TypeError("Mixing Polars lazyframes and dataframes is not supported.") - - else: - modules = [type(df).__module__ for df in dfs] - if all([is_polars(df) or is_pandas(df) for df in dfs]): - raise TypeError( - "Mixing Pandas and Polars dataframes is not supported, " - f"got {modules=!r}." 
- ) - else: - raise TypeError( - "Only Pandas or Polars dataframes are currently supported, " - f"got {modules=!r}." - ) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py deleted file mode 100644 index b3be557ec..000000000 --- a/skrub/dataframe/_pandas.py +++ /dev/null @@ -1,339 +0,0 @@ -""" -Pandas specialization of the aggregate and join operation. -""" -import re -from collections.abc import Callable -from itertools import product -from typing import Iterable - -import numpy as np -import pandas as pd - -from skrub._utils import atleast_1d_or_none - - -def make_dataframe(X, index=None): - """Convert an dictionary of columns into a Pandas dataframe. - - Parameters - ---------- - X : mapping from column name to 1d iterable - Input data to convert. - - index : 1d array-like, default=None - The index of the dataframe. - - Returns - ------- - X : Pandas dataframe - Converted output. - """ - if not isinstance(X, dict) or not all( - (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) - for X_col in X.values() - ): - raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") - return pd.DataFrame(X, index=index) - - -def make_series(X, index=None, name=None): - """Convert an 1d array into a Pandas series. - - Parameters - ---------- - X : 1d iterable - Input data to convert. - - index : 1d array-like, default=None - The index of the series. - - name : str, default=None - The name of the series. - - Returns - ------- - X : Pandas series - Converted output. - """ - return pd.Series(X, index=index, name=name) - - -def aggregate( - table: pd.DataFrame, - key: str | Iterable[str], - cols_to_agg: str | Iterable[str], - num_operations: str | Iterable[str] = ("mean",), - categ_operations: str | Iterable[str] = ("mode",), - suffix: str | None = None, -) -> pd.DataFrame: - """Aggregates a :obj:`pandas.DataFrame`. - - This function uses the ``dataframe.groupby(key).agg`` method from Pandas. - - Parameters - ---------- - table : pd.DataFrame, - The input dataframe to aggregate. - - key : str or Iterable[str], - The columns used as keys to aggregate on. - - cols_to_agg : str or Iterable[str], - The columns to aggregate. - - num_operations : str or Iterable[str], - The reduction functions to apply on numerical columns - in ``cols_to_agg`` during the aggregation. - - categ_operations : str or Iterable[str], - The reduction functions to apply on categorical columns - in ``cols_to_agg`` during the aggregation. - - suffix : str, optional - The suffix appended to output columns. - - Returns - ------- - group : pd.DataFrame, - The aggregated output. 
- """ - if not isinstance(table, pd.DataFrame): - raise TypeError(f"'table' must be a pandas dataframe, got {type(table)!r}.") - - key = atleast_1d_or_none(key) - cols_to_agg = atleast_1d_or_none(cols_to_agg) - num_operations = atleast_1d_or_none(num_operations) - categ_operations = atleast_1d_or_none(categ_operations) - suffix = "" if suffix is None else suffix - - num_cols, categ_cols = split_num_categ_cols(table[cols_to_agg]) - - num_named_agg, num_value_counts = get_named_agg(table, num_cols, num_operations) - categ_named_agg, categ_value_counts = get_named_agg( - table, categ_cols, categ_operations - ) - - named_agg = {**num_named_agg, **categ_named_agg} - if named_agg: - base_group = table.groupby(key).agg(**named_agg) - else: - base_group = None - - # 'histogram' and 'value_counts' requires a pivot - value_counts = {**num_value_counts, **categ_value_counts} - for output_key, (col_to_agg, kwargs) in value_counts.items(): - serie_group = table.groupby(key)[col_to_agg].value_counts(**kwargs) - serie_group.name = output_key - pivot = ( - serie_group.reset_index() - .pivot(index=key, columns=col_to_agg) - .reset_index() - .fillna(0) - ) - cols = pivot.columns.droplevel(0) - index_cols = np.atleast_1d(key).tolist() - feature_cols = (f"{col_to_agg}_" + cols[len(index_cols) :].astype(str)).tolist() - cols = [*index_cols, *feature_cols] - pivot.columns = cols - - if base_group is None: - base_group = pivot - else: - base_group = base_group.merge(pivot, on=key, how="left") - - if base_group is None: - raise ValueError("No aggregation to perform.") - - base_group.columns = [ - f"{col}{suffix}" if col not in key else col for col in base_group.columns - ] - sorted_cols = sorted(base_group.columns) - - return base_group[sorted_cols] - - -def join( - left: pd.DataFrame, - right: pd.DataFrame, - left_on: str | Iterable[str], - right_on: str | Iterable[str], -) -> pd.DataFrame: - """Left join two :obj:`pandas.DataFrame`. - - This function uses the ``dataframe.merge`` method from Pandas. - - Parameters - ---------- - left : pd.DataFrame, - The left dataframe to left-join. - - right : pd.DataFrame, - The right dataframe to left-join. - - left_on : str or Iterable[str] - Left keys to merge on. - - right_on : str or Iterable[str] - Right keys to merge on. - - Returns - ------- - merged : pd.DataFrame, - The merged output. - """ - if not (isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame)): - raise TypeError( - "'left' and 'right' must be pandas dataframes, " - f"got {type(left)!r} and {type(right)!r}." - ) - return left.merge( - right, - how="left", - left_on=left_on, - right_on=right_on, - ) - - -def get_named_agg( - table: pd.DataFrame, cols: list[str], operations: list[str] -) -> tuple[dict, dict]: - """Map aggregation tuples to their output key. - - The dictionary has the form: output_key = (column, aggfunc). - This is used as input for the ``dataframe.agg`` method from Pandas. - - 'value_counts' and 'hist' operation require to pivot - the tables and treated in a separate mapping. - - Parameters - ---------- - table : pd.DataFrame, - Input dataframe, only used to compute bins values if - 'value_counts' or 'hist' are operations. - - cols : list, - The columns to aggregate. - - operations : list, - The reduce operations to perform. - - Returns - ------- - named_agg : dict, - Named aggregation mapping. - - value_counts : dict, - ``value_counts`` operations mapping. 
- """ - named_agg, value_counts = {}, {} - for col, operation in product(cols, operations): - op_root, bin_args = _parse_argument(operation) - aggfunc, bin_args = _get_aggfunc(table[col], op_root, bin_args) - - output_key = f"{col}_{op_root}" - # 'value_counts' change the index of the resulting frame - # and must be treated separately. - if aggfunc == "value_counts": - value_counts[output_key] = (col, bin_args) - else: - named_agg[output_key] = (col, aggfunc) - - return named_agg, value_counts - - -def _parse_argument(operation: str) -> tuple[str, int | None]: - """Split a text input into a function name and its argument. - - Parameters - ---------- - operation : str, - The operation to parse. - - Returns - ------- - operation_root : str, - The name of the operation before parenthesis, if any. - - bin_args : int, - The number of bin to create for ``hist`` or ``value_counts``. - - Examples - -------- - >>> _parse_argument("hist(10)") - ('hist', 10) - """ - split = re.split("\\(.+\\)", operation) - op_root = split[0] - if len(split) > 1: - # remove op_root - bin_args = re.split(f"^{op_root}", operation) - bin_args = bin_args[1] - # remove parenthesis - bin_args = re.sub("\\(|\\)", "", bin_args) - bin_args = int(bin_args) - return op_root, bin_args - else: - return op_root, None - - -PANDAS_OPS_MAPPING = { - "mode": pd.Series.mode, - "quantile": pd.Series.quantile, - "hist": "value_counts", -} - - -def _get_aggfunc( - serie: pd.Series, op_root: str, n_bins: int -) -> tuple[str | Callable, dict]: - """Map operation roots to their pandas agg functions. - - When args is provided for histogram or value_counts, - we create args - - Parameters - ---------- - serie : pd.Series, - Input series, used to compute the bins if n_bins is provided. - - op_root : str, - Operation root, the operation without the bin argument, if any. - - n_bins : int, - The number of bin to create when value_counts or hist operation are used. - - Returns - ------- - aggfunc : str or callable, - The pandas agg functions to perform - - bins_args : dict, - The bins to create when using value_counts or hist. - """ - aggfunc = PANDAS_OPS_MAPPING.get(op_root, op_root) - - if n_bins is not None: - # histogram and value_counts - if aggfunc == "value_counts": - # If bins is a number, we need to set a fix bin range, - # otherwise bins edges will be defined dynamically for - # each rows. - min_, max_ = serie.min(), serie.max() - bins = np.linspace(min_, max_, n_bins + 1) - bins_args = dict(bins=bins) - else: - raise ValueError( - f"Operator {op_root!r} doesn't take any argument, got {n_bins!r}" - ) - else: - bins_args = {} - - return aggfunc, bins_args - - -def split_num_categ_cols(table): - """Split dataframe columns between numerical and categorical.""" - num_cols = table.select_dtypes("number").columns - categ_cols = table.select_dtypes(["object", "string", "category"]).columns - - return num_cols, categ_cols diff --git a/skrub/dataframe/_polars.py b/skrub/dataframe/_polars.py deleted file mode 100644 index bf4e30f7f..000000000 --- a/skrub/dataframe/_polars.py +++ /dev/null @@ -1,278 +0,0 @@ -""" -Polars specialization of the aggregate and join operations. -""" -from typing import Iterable - -import numpy as np - -from skrub.dataframe._types import POLARS_SETUP, DataFrameLike - -if POLARS_SETUP: - import polars as pl - import polars.selectors as cs - -from itertools import product - -from skrub._utils import atleast_1d_or_none - - -def make_dataframe(X, index=None): - """Convert an dictionary of columns into a Polars dataframe. 
- - Parameters - ---------- - X : mapping from column name to 1d iterable - Input data to convert. - - index : 1d array-like, default=None - Unused since polars doesn't use index. - Only here for compatibility with Pandas. - - Returns - ------- - X : Polars dataframe - Converted output. - """ - if index is not None: - raise ValueError( - "Polars dataframes don't have an index, but " - f"the Polars dataframe maker was called with {index=!r}." - ) - if not isinstance(X, dict) or not all( - (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) - for X_col in X.values() - ): - raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") - return pl.DataFrame(X) - - -def make_series(X, index=None, name=None): - """Convert an 1d array into a Polars series. - - Parameters - ---------- - X : 1d iterable - Input data to convert. - - index : 1d array-like, default=None - Unused since polars doesn't use index. - Only here for compatibility with Pandas. - - name : str, default=None - The name of the series. - - Returns - ------- - X : Polars series - Converted output. - """ - if index is not None: - raise ValueError( - "Polars series don't have an index, but " - f"the Polars series maker was called with {index=!r}." - ) - return pl.Series(values=X, name=name) - - -def aggregate( - table: DataFrameLike, - key: str | Iterable[str], - cols_to_agg: str | Iterable[str], - num_operations: str | Iterable[str] = ("mean",), - categ_operations: str | Iterable[str] = ("mode",), - suffix: str | None = None, -) -> DataFrameLike: - """Aggregate a :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`. - - This function uses the ``dataframe.group_by(key).agg`` method from Polars. - - Parameters - ---------- - table : pl.DataFrame or pl.LazyFrame, - The input dataframe to aggregate. - - key : str or Iterable[str], - The columns used as keys to aggregate on. - - cols_to_agg : str or Iterable[str], - The columns to aggregate. - - num_operations : str or Iterable[str], - The reduction functions to apply on numerical columns - in ``cols_to_agg`` during the aggregation. - - categ_operations : str or Iterable[str], - The reduction functions to apply on categorical columns - in ``cols_to_agg`` during the aggregation. - - suffix : str, - The suffix appended to output columns. - - Returns - ------- - group : pl.DataFrame or pl.LazyFrame, - The aggregated output. - """ - if not isinstance(table, (pl.DataFrame, pl.LazyFrame)): - raise TypeError( - f"'table' must be a polars dataframe or lazyframe, got {type(table)!r}." - ) - - key = atleast_1d_or_none(key) - cols_to_agg = atleast_1d_or_none(cols_to_agg) - num_operations = atleast_1d_or_none(num_operations) - categ_operations = atleast_1d_or_none(categ_operations) - suffix = "" if suffix is None else suffix - - num_cols, categ_cols = split_num_categ_cols(table.select(cols_to_agg)) - - num_aggfuncs, num_mode_cols = get_aggfuncs(num_cols, num_operations) - categ_aggfuncs, categ_mode_cols = get_aggfuncs(categ_cols, categ_operations) - - aggfuncs = [*num_aggfuncs, *categ_aggfuncs] - # If aggfuncs is empty, the output will be a series of index. - table = table.group_by(key).agg(aggfuncs) - - # flattening post-processing of mode() cols - flatten_ops = [] - for col in [*num_mode_cols, *categ_mode_cols]: - flatten_ops.append(pl.col(col).list[0].alias(col)) - # add columns, no-op if 'flatten_ops' is empty. 
- table = table.with_columns(flatten_ops) - - cols_renaming = {col: f"{col}{suffix}" for col in table.columns if col not in key} - table = table.rename(cols_renaming) - sorted_cols = sorted(table.columns) - - return table.select(sorted_cols) - - -def join( - left: DataFrameLike, - right: DataFrameLike, - left_on: str | Iterable[str], - right_on: str | Iterable[str], -) -> DataFrameLike: - """Left join two :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`. - - This function uses the ``dataframe.join`` method from Polars. - - Note that the input dataframes type must agree: either both - Polars dataframes or both Polars lazyframes. - - Mixing polars dataframe with lazyframe will raise an error. - - Parameters - ---------- - left : pl.DataFrame or pl.LazyFrame, - The left dataframe of the left-join. - - right : pl.DataFrame or pl.LazyFrame, - The right dataframe of the left-join. - - left_on : str or Iterable[str], - Left keys to merge on. - - right_on : str or Iterable[str], - Right keys to merge on. - - Returns - ------- - merged : pl.DataFrame or pl.LazyFrame, - The merged output. - """ - is_dataframe = isinstance(left, pl.DataFrame) and isinstance(right, pl.DataFrame) - is_lazyframe = isinstance(left, pl.LazyFrame) and isinstance(right, pl.LazyFrame) - if is_dataframe or is_lazyframe: - return left.join( - right, - how="left", - left_on=left_on, - right_on=right_on, - ) - else: - raise TypeError( - "'left' and 'right' must be polars dataframes or lazyframes, " - f"got {type(left)!r} and {type(right)!r}." - ) - - -def get_aggfuncs( - cols: list[str], - operations: list[str], -) -> tuple[list, list]: - """List Polars aggregation functions. - - The list is used as input for the ``dataframe.group_by().agg()`` method from Polars. - The 'mode' operation needs a flattening post-processing. - - Parameters - ---------- - cols : list, - The columns to aggregate. - - operations : list, - The reduce operations to perform. - - Returns - ------- - aggfuncs : list, - Named aggregation list. - - mode_cols : list, - Output keys to post-process after 'mode' aggregation. - """ - aggfuncs, mode_cols = [], [] - for col, operation in product(cols, operations): - output_key = f"{col}_{operation}" - aggfunc = _polars_ops_mapping(col, operation, output_key) - aggfuncs.append(aggfunc) - - if operation == "mode": - mode_cols.append(output_key) - - return aggfuncs, mode_cols - - -def _polars_ops_mapping(col, operation, output_key): - """Map an operation to its Polars expression. - - Parameters - ---------- - col : str, - Name of the column to aggregate. - operation : str, - Name of the reduce function. - output_key : str, - Name of the reduced column. - - Returns - ------- - aggfunc: polars.Expression, - The expression to apply. - """ - polars_aggfuncs = { - "mean": pl.col(col).mean(), - "std": pl.col(col).std(), - "sum": pl.col(col).sum(), - "min": pl.col(col).min(), - "max": pl.col(col).max(), - "mode": pl.col(col).mode(), - } - aggfunc = polars_aggfuncs.get(operation, None) - - if aggfunc is None: - raise ValueError( - f"Polars operation {operation!r} is not supported. 
Available:" - f" {list(polars_aggfuncs)}" - ) - - return aggfunc.alias(output_key) - - -def split_num_categ_cols(table): - """Split a dataframe columns between numerical and categorical.""" - num_cols = table.select(cs.numeric()).columns - categ_cols = table.select(cs.string()).columns - - return num_cols, categ_cols diff --git a/skrub/dataframe/_types.py b/skrub/dataframe/_types.py deleted file mode 100644 index 0468a613c..000000000 --- a/skrub/dataframe/_types.py +++ /dev/null @@ -1,14 +0,0 @@ -import pandas as pd - -try: - import polars as pl - - POLARS_SETUP = True -except ImportError: - POLARS_SETUP = False - -DataFrameLike = pd.DataFrame -SeriesLike = pd.Series -if POLARS_SETUP: - DataFrameLike |= pl.DataFrame | pl.LazyFrame - SeriesLike |= pl.Series diff --git a/skrub/dataframe/tests/__init__.py b/skrub/dataframe/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/skrub/dataframe/tests/test_namespace.py b/skrub/dataframe/tests/test_namespace.py deleted file mode 100644 index bccdbabe9..000000000 --- a/skrub/dataframe/tests/test_namespace.py +++ /dev/null @@ -1,41 +0,0 @@ -import pandas as pd -import pytest - -import skrub.dataframe._pandas as skrub_pd -import skrub.dataframe._polars as skrub_pl -from skrub.dataframe import POLARS_SETUP, get_df_namespace - -main = pd.DataFrame( - { - "userId": [1, 1, 1, 2, 2, 2], - "movieId": [1, 3, 6, 318, 6, 1704], - "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], - "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], - } -) - - -def test_get_namespace_pandas(): - skrub_px, px = get_df_namespace(main, main) - assert skrub_px is skrub_pd - assert px is pd - - with pytest.raises(TypeError, match=r"(?=.*Only Pandas or Polars)(?=.*supported)"): - get_df_namespace(main, main.values) - - -@pytest.mark.skipif(not POLARS_SETUP, reason="Polars is not available") -def test_get_namespace_polars(): - import polars as pl - - skrub_px, px = get_df_namespace(pl.DataFrame(main), pl.DataFrame(main)) - assert skrub_px is skrub_pl - assert px is pl - - with pytest.raises(TypeError, match=r"(?=.*Mixing Pandas)(?=.*Polars)"): - get_df_namespace(main, pl.DataFrame(main)) - - with pytest.raises( - TypeError, match=r"(?=.*Mixing)(?=.*lazyframes)(?=.*dataframes)" - ): - get_df_namespace(pl.DataFrame(main), pl.LazyFrame(main)) diff --git a/skrub/dataframe/tests/test_pandas.py b/skrub/dataframe/tests/test_pandas.py deleted file mode 100644 index 1453dd230..000000000 --- a/skrub/dataframe/tests/test_pandas.py +++ /dev/null @@ -1,113 +0,0 @@ -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal, assert_series_equal - -from skrub.dataframe._pandas import aggregate, join, make_dataframe, make_series - -main = pd.DataFrame( - { - "userId": [1, 1, 1, 2, 2, 2], - "movieId": [1, 3, 6, 318, 6, 1704], - "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], - "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], - } -) - - -def test_join(): - joined = join(left=main, right=main, left_on="movieId", right_on="movieId") - expected = main.merge(main, on="movieId", how="left") - assert_frame_equal(joined, expected) - - -def test_simple_agg(): - aggregated = aggregate( - table=main, - key="movieId", - cols_to_agg=["rating", "genre"], - num_operations="mean", - categ_operations="mode", - ) - aggfunc = { - "genre_mode": ("genre", pd.Series.mode), - "rating_mean": ("rating", "mean"), - } - expected = main.groupby("movieId").agg(**aggfunc) - assert_frame_equal(aggregated, expected) - - -def test_value_counts_agg(): - aggregated = 
aggregate( - table=main, - key="userId", - cols_to_agg="rating", - num_operations="value_counts", - categ_operations=None, - suffix="_user", - ) - expected = pd.DataFrame( - { - "rating_2.0_user": [0.0, 1.0], - "rating_3.0_user": [0.0, 1.0], - "rating_4.0_user": [3.0, 1.0], - "userId": [1, 2], - } - ) - assert_frame_equal(aggregated, expected) - - aggregated = aggregate( - table=main, - key="userId", - cols_to_agg="rating", - num_operations="hist(2)", - categ_operations=None, - suffix="_user", - ) - expected = pd.DataFrame( - { - "rating_(1.999, 3.0]_user": [0, 2], - "rating_(3.0, 4.0]_user": [3, 1], - "userId": [1, 2], - } - ) - assert_frame_equal(aggregated, expected) - - -def test_incorrect_dataframe_inputs(): - with pytest.raises(TypeError, match=r"(?=.*pandas dataframes)(?=.*array)"): - join(left=main.values, right=main, left_on="movieId", right_on="movieId") - - with pytest.raises(TypeError, match=r"(?=.*pandas dataframe)(?=.*array)"): - aggregate( - table=main.values, - key="movieId", - cols_to_agg="rating", - num_operations="mean", - ) - - -def test_no_agg_operation(): - with pytest.raises(ValueError, match=r"(?=.*No aggregation)"): - aggregate( - table=main, - key="movieId", - cols_to_agg="rating", - num_operations=None, - categ_operations=None, - ) - - -def test_make_dataframe(): - X = dict(a=[1, 2], b=["z", "e"]) - expected_df = pd.DataFrame(dict(a=[1, 2], b=["z", "e"])) - assert_frame_equal(make_dataframe(X, index=[0, 1]), expected_df) - - X = [[1, 2], ["z", "e"]] - with pytest.raises(TypeError): - make_dataframe(X) - - -def test_make_series(): - X = [1, 2, 3] - expected_series = pd.Series(X) - assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) diff --git a/skrub/dataframe/tests/test_polars.py b/skrub/dataframe/tests/test_polars.py deleted file mode 100644 index c9f8bc4bd..000000000 --- a/skrub/dataframe/tests/test_polars.py +++ /dev/null @@ -1,92 +0,0 @@ -import pandas as pd -import pytest - -from skrub.dataframe import POLARS_SETUP -from skrub.dataframe._polars import aggregate, join, make_dataframe, make_series - -if POLARS_SETUP: - import polars as pl - from polars.testing import assert_frame_equal, assert_series_equal - - main = pl.DataFrame( - { - "userId": [1, 1, 1, 2, 2, 2], - "movieId": [1, 3, 6, 318, 6, 1704], - "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], - "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], - } - ) -else: - polars_missing_msg = "Polars is not available" - pytest.skip(reason=polars_missing_msg, allow_module_level=True) - - -def test_join(): - joined = join(left=main, right=main, left_on="movieId", right_on="movieId") - expected = main.join(main, on="movieId", how="left") - assert_frame_equal(joined, expected) - - -def test_simple_agg(): - aggregated = aggregate( - table=main, - key="movieId", - cols_to_agg="rating", - num_operations="mean", - ) - aggfunc = pl.col("rating").mean().alias("rating_mean") - expected = main.group_by("movieId").agg(aggfunc) - # As group_by parallizes threads, the row order of its output isn't - # deterministic. Hence, we need to set check_row_order to False. 
- assert_frame_equal(aggregated, expected, check_row_order=False) - - -def test_mode_agg(): - aggregated = aggregate( - table=main, - key="movieId", - cols_to_agg="genre", - categ_operations=["mode"], - ) - expected = pl.DataFrame( - { - "genre_mode": ["drama", "drama", "sf", "sf", "comedy"], - "movieId": [3, 1, 318, 1704, 6], - } - ) - assert_frame_equal(aggregated, expected, check_row_order=False) - - -def test_incorrect_dataframe_inputs(): - with pytest.raises(TypeError, match=r"(?=.*polars dataframes)(?=.*pandas)"): - join(left=pd.DataFrame(main), right=main, left_on="movieId", right_on="movieId") - - with pytest.raises(TypeError, match=r"(?=.*polars dataframe)(?=.*pandas)"): - aggregate( - table=pd.DataFrame(main), - key="movieId", - cols_to_agg="rating", - num_operations="mean", - ) - - -def test_make_dataframe(): - X = dict(a=[1, 2], b=["z", "e"]) - expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) - assert_frame_equal(make_dataframe(X), expected_df) - - X = [[1, 2], ["z", "e"]] - with pytest.raises(TypeError): - make_dataframe(X) - - with pytest.raises(ValueError, match=r"(?=.*Polars dataframe)(?=.*index)"): - make_dataframe(X, index=[0, 1]) - - -def test_make_series(): - X = [1, 2, 3] - expected_series = pl.Series(X) - assert_series_equal(make_series(X, index=None), expected_series) - - with pytest.raises(ValueError, match=r"(?=.*Polars series)(?=.*index)"): - make_series(X, index=[0, 1]) diff --git a/skrub/tests/test_agg_joiner.py b/skrub/tests/test_agg_joiner.py index 6ed8ac2af..bd0ebf440 100644 --- a/skrub/tests/test_agg_joiner.py +++ b/skrub/tests/test_agg_joiner.py @@ -3,7 +3,7 @@ from pandas.testing import assert_frame_equal from sklearn.pipeline import make_pipeline -from skrub.dataframe import POLARS_SETUP +from skrub._dataframe._polars import POLARS_SETUP if POLARS_SETUP: import polars as pl From 54502dfaf1cadb9f8e6aa07b26e5fb3465db64b4 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 18:02:43 +0100 Subject: [PATCH 5/7] add private modules --- skrub/_dataframe/__init__.py | 0 skrub/_dataframe/_namespace.py | 99 +++++++ skrub/_dataframe/_pandas.py | 328 +++++++++++++++++++++++ skrub/_dataframe/_polars.py | 263 ++++++++++++++++++ skrub/_dataframe/tests/__init__.py | 0 skrub/_dataframe/tests/test_namespace.py | 42 +++ skrub/_dataframe/tests/test_pandas.py | 109 ++++++++ skrub/_dataframe/tests/test_polars.py | 93 +++++++ 8 files changed, 934 insertions(+) create mode 100644 skrub/_dataframe/__init__.py create mode 100644 skrub/_dataframe/_namespace.py create mode 100644 skrub/_dataframe/_pandas.py create mode 100644 skrub/_dataframe/_polars.py create mode 100644 skrub/_dataframe/tests/__init__.py create mode 100644 skrub/_dataframe/tests/test_namespace.py create mode 100644 skrub/_dataframe/tests/test_pandas.py create mode 100644 skrub/_dataframe/tests/test_polars.py diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/skrub/_dataframe/_namespace.py b/skrub/_dataframe/_namespace.py new file mode 100644 index 000000000..06c65a2ea --- /dev/null +++ b/skrub/_dataframe/_namespace.py @@ -0,0 +1,99 @@ +import sys + +import pandas as pd + +import skrub._dataframe._pandas as skrub_pd +import skrub._dataframe._polars as skrub_pl + + +def is_pandas(dataframe): + """Check whether the input is a Pandas dataframe. + + Parameters + ---------- + dataframe : DataFrameLike + The input dataframe + + Returns + ------- + is_pandas : bool + Whether the dataframe is a Pandas dataframe or not. 
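+
+    Examples
+    --------
+    A quick check on both a dataframe and a plain dict:
+
+    >>> import pandas as pd
+    >>> is_pandas(pd.DataFrame({"a": [1]}))
+    True
+    >>> is_pandas({"a": [1]})
+    False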
+ """ + return isinstance(dataframe, pd.DataFrame) + + +def is_polars(dataframe): + """Check whether the input is a Polars dataframe or lazyframe. + + Parameters + ---------- + dataframe : DataFrameLike + The input dataframe + + Returns + ------- + is_polars : bool + Whether the dataframe is a Polars dataframe/lazyframe or not. + """ + if "polars" not in sys.modules: + return False + + import polars as pl + + return isinstance(dataframe, (pl.DataFrame, pl.LazyFrame)) + + +def get_df_namespace(*dfs): + """Get the namespaces of dataframes. + + Introspects dataframes and returns their skrub namespace object + ``skrub.dataframe._{pandas, polars}`` and the dataframe module + ``{polars, pandas}`` itself. + + The dataframes passed in input need to come from the same module, otherwise a + ``TypeError`` will be raised. + + The outputs of this function are denoted ``skrub_px`` and ``px`` in reference to + the array API, returning namespace (NumPy, PyTorch and CuPy) as ``nx``. + Since we deal with Polars (``pl``) and Pandas (``pd``), we use ``px`` + as a variable name. + + Parameters + ---------- + dfs : DataFrameLike | list[DataFrameLike], + The dataframes to extract modules from. + + Returns + ------- + skrub_px : ModuleType + Skrub namespace shared by dataframe objects. + + px : ModuleType + Dataframe namespace, i.e. Pandas or Polars module. + """ + # FIXME Pandas and Polars series will raise errors. + if all([is_pandas(df) for df in dfs]): + return skrub_pd, pd + + elif all([is_polars(df) for df in dfs]): + import polars as pl + + if all([isinstance(df, pl.DataFrame) for df in dfs]) or all( + [isinstance(df, pl.LazyFrame) for df in dfs] + ): + return skrub_pl, pl + else: + raise TypeError("Mixing Polars lazyframes and dataframes is not supported.") + + else: + modules = [type(df).__module__ for df in dfs] + if all([is_polars(df) or is_pandas(df) for df in dfs]): + raise TypeError( + "Mixing Pandas and Polars dataframes is not supported, " + f"got {modules=!r}." + ) + else: + raise TypeError( + "Only Pandas or Polars dataframes are currently supported, " + f"got {modules=!r}." + ) diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py new file mode 100644 index 000000000..c8fb820b1 --- /dev/null +++ b/skrub/_dataframe/_pandas.py @@ -0,0 +1,328 @@ +""" +Pandas specialization of the aggregate and join operation. +""" +import re +from itertools import product + +import numpy as np +import pandas as pd + +from skrub._utils import atleast_1d_or_none + + +def make_dataframe(X, index=None): + """Convert an dictionary of columns into a Pandas dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the dataframe. + + Returns + ------- + X : Pandas dataframe + Converted output. + """ + return pd.DataFrame(X, index=index) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Pandas series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the series. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Pandas series + Converted output. + """ + return pd.Series(X, index=index, name=name) + + +def aggregate( + table, + key, + cols_to_agg, + num_operations=("mean",), + categ_operations=("mode",), + suffix=None, +): + """Aggregates a :obj:`pandas.DataFrame`. + + This function uses the ``dataframe.groupby(key).agg`` method from Pandas. 
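+
+    For example, ``num_operations="hist(2)"`` bins each numerical column in
+    ``cols_to_agg`` into two fixed-width bins and counts the entries per bin,
+    yielding one output column per bin.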
+ + Parameters + ---------- + table : pd.DataFrame, + The input dataframe to aggregate. + + key : str or Iterable[str], + The columns used as keys to aggregate on. + + cols_to_agg : str or Iterable[str], + The columns to aggregate. + + num_operations : str or Iterable[str], + The reduction functions to apply on numerical columns + in ``cols_to_agg`` during the aggregation. + + categ_operations : str or Iterable[str], + The reduction functions to apply on categorical columns + in ``cols_to_agg`` during the aggregation. + + suffix : str, optional + The suffix appended to output columns. + + Returns + ------- + group : pd.DataFrame, + The aggregated output. + """ + if not isinstance(table, pd.DataFrame): + raise TypeError(f"'table' must be a pandas dataframe, got {type(table)!r}.") + + key = atleast_1d_or_none(key) + cols_to_agg = atleast_1d_or_none(cols_to_agg) + num_operations = atleast_1d_or_none(num_operations) + categ_operations = atleast_1d_or_none(categ_operations) + suffix = "" if suffix is None else suffix + + num_cols, categ_cols = split_num_categ_cols(table[cols_to_agg]) + + num_named_agg, num_value_counts = get_named_agg(table, num_cols, num_operations) + categ_named_agg, categ_value_counts = get_named_agg( + table, categ_cols, categ_operations + ) + + named_agg = {**num_named_agg, **categ_named_agg} + if named_agg: + base_group = table.groupby(key).agg(**named_agg) + else: + base_group = None + + # 'histogram' and 'value_counts' requires a pivot + value_counts = {**num_value_counts, **categ_value_counts} + for output_key, (col_to_agg, kwargs) in value_counts.items(): + serie_group = table.groupby(key)[col_to_agg].value_counts(**kwargs) + serie_group.name = output_key + pivot = ( + serie_group.reset_index() + .pivot(index=key, columns=col_to_agg) + .reset_index() + .fillna(0) + ) + cols = pivot.columns.droplevel(0) + index_cols = np.atleast_1d(key).tolist() + feature_cols = (f"{col_to_agg}_" + cols[len(index_cols) :].astype(str)).tolist() + cols = [*index_cols, *feature_cols] + pivot.columns = cols + + if base_group is None: + base_group = pivot + else: + base_group = base_group.merge(pivot, on=key, how="left") + + if base_group is None: + raise ValueError("No aggregation to perform.") + + base_group.columns = [ + f"{col}{suffix}" if col not in key else col for col in base_group.columns + ] + sorted_cols = sorted(base_group.columns) + + return base_group[sorted_cols] + + +def join( + left, + right, + left_on, + right_on, +): + """Left join two :obj:`pandas.DataFrame`. + + This function uses the ``dataframe.merge`` method from Pandas. + + Parameters + ---------- + left : pd.DataFrame, + The left dataframe to left-join. + + right : pd.DataFrame, + The right dataframe to left-join. + + left_on : str or Iterable[str] + Left keys to merge on. + + right_on : str or Iterable[str] + Right keys to merge on. + + Returns + ------- + merged : pd.DataFrame, + The merged output. + """ + if not (isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame)): + raise TypeError( + "'left' and 'right' must be pandas dataframes, " + f"got {type(left)!r} and {type(right)!r}." + ) + return left.merge( + right, + how="left", + left_on=left_on, + right_on=right_on, + ) + + +def get_named_agg(table, cols, operations): + """Map aggregation tuples to their output key. + + The dictionary has the form: output_key = (column, aggfunc). + This is used as input for the ``dataframe.agg`` method from Pandas. 
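+    For instance, ``{"rating_mean": ("rating", "mean")}`` requests the mean
+    of the ``rating`` column under the output name ``rating_mean``.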
+ + 'value_counts' and 'hist' operation require to pivot + the tables and treated in a separate mapping. + + Parameters + ---------- + table : pd.DataFrame, + Input dataframe, only used to compute bins values if + 'value_counts' or 'hist' are operations. + + cols : list, + The columns to aggregate. + + operations : list, + The reduce operations to perform. + + Returns + ------- + named_agg : dict, + Named aggregation mapping. + + value_counts : dict, + ``value_counts`` operations mapping. + """ + named_agg, value_counts = {}, {} + for col, operation in product(cols, operations): + op_root, bin_args = _parse_argument(operation) + aggfunc, bin_args = _get_aggfunc(table[col], op_root, bin_args) + + output_key = f"{col}_{op_root}" + # 'value_counts' change the index of the resulting frame + # and must be treated separately. + if aggfunc == "value_counts": + value_counts[output_key] = (col, bin_args) + else: + named_agg[output_key] = (col, aggfunc) + + return named_agg, value_counts + + +def _parse_argument(operation): + """Split a text input into a function name and its argument. + + Parameters + ---------- + operation : str, + The operation to parse. + + Returns + ------- + operation_root : str, + The name of the operation before parenthesis, if any. + + bin_args : int, + The number of bin to create for ``hist`` or ``value_counts``. + + Examples + -------- + >>> _parse_argument("hist(10)") + ('hist', 10) + """ + split = re.split("\\(.+\\)", operation) + op_root = split[0] + if len(split) > 1: + # remove op_root + bin_args = re.split(f"^{op_root}", operation) + bin_args = bin_args[1] + # remove parenthesis + bin_args = re.sub("\\(|\\)", "", bin_args) + bin_args = int(bin_args) + return op_root, bin_args + else: + return op_root, None + + +PANDAS_OPS_MAPPING = { + "mode": pd.Series.mode, + "quantile": pd.Series.quantile, + "hist": "value_counts", +} + + +def _get_aggfunc(serie, op_root, n_bins): + """Map operation roots to their pandas agg functions. + + When args is provided for histogram or value_counts, + we create args + + Parameters + ---------- + serie : pd.Series, + Input series, used to compute the bins if n_bins is provided. + + op_root : str, + Operation root, the operation without the bin argument, if any. + + n_bins : int, + The number of bin to create when value_counts or hist operation are used. + + Returns + ------- + aggfunc : str or callable, + The pandas agg functions to perform + + bins_args : dict, + The bins to create when using value_counts or hist. + """ + aggfunc = PANDAS_OPS_MAPPING.get(op_root, op_root) + + if n_bins is not None: + # histogram and value_counts + if aggfunc == "value_counts": + # If bins is a number, we need to set a fix bin range, + # otherwise bins edges will be defined dynamically for + # each rows. 
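+            # For instance, n_bins=2 over values spanning [2.0, 4.0] yields
+            # bin edges [2.0, 3.0, 4.0], i.e. two fixed-width bins.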
+ min_, max_ = serie.min(), serie.max() + bins = np.linspace(min_, max_, n_bins + 1) + bins_args = dict(bins=bins) + else: + raise ValueError( + f"Operator {op_root!r} doesn't take any argument, got {n_bins!r}" + ) + else: + bins_args = {} + + return aggfunc, bins_args + + +def split_num_categ_cols(table): + """Split dataframe columns between numerical and categorical.""" + num_cols = table.select_dtypes("number").columns + categ_cols = table.select_dtypes(["object", "string", "category"]).columns + + return num_cols, categ_cols diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py new file mode 100644 index 000000000..20f7d1913 --- /dev/null +++ b/skrub/_dataframe/_polars.py @@ -0,0 +1,263 @@ +""" +Polars specialization of the aggregate and join operations. +""" +try: + import polars as pl + import polars.selectors as cs + + POLARS_SETUP = True +except ImportError: + POLARS_SETUP = False + +from itertools import product + +from skrub._utils import atleast_1d_or_none + + +def make_dataframe(X, index=None): + """Convert an dictionary of columns into a Polars dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + Returns + ------- + X : Polars dataframe + Converted output. + """ + if index is not None: + raise ValueError( + "Polars dataframes don't have an index, but " + f"the Polars dataframe maker was called with {index=!r}." + ) + return pl.DataFrame(X) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Polars series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Polars series + Converted output. + """ + if index is not None: + raise ValueError( + "Polars series don't have an index, but " + f"the Polars series maker was called with {index=!r}." + ) + return pl.Series(values=X, name=name) + + +def aggregate( + table, + key, + cols_to_agg, + num_operations=("mean",), + categ_operations=("mode",), + suffix=None, +): + """Aggregate a :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`. + + This function uses the ``dataframe.group_by(key).agg`` method from Polars. + + Parameters + ---------- + table : pl.DataFrame or pl.LazyFrame, + The input dataframe to aggregate. + + key : str or Iterable[str], + The columns used as keys to aggregate on. + + cols_to_agg : str or Iterable[str], + The columns to aggregate. + + num_operations : str or Iterable[str], + The reduction functions to apply on numerical columns + in ``cols_to_agg`` during the aggregation. + + categ_operations : str or Iterable[str], + The reduction functions to apply on categorical columns + in ``cols_to_agg`` during the aggregation. + + suffix : str, + The suffix appended to output columns. + + Returns + ------- + group : pl.DataFrame or pl.LazyFrame, + The aggregated output. + """ + if not isinstance(table, (pl.DataFrame, pl.LazyFrame)): + raise TypeError( + f"'table' must be a polars dataframe or lazyframe, got {type(table)!r}." 
+    key = atleast_1d_or_none(key)
+    cols_to_agg = atleast_1d_or_none(cols_to_agg)
+    num_operations = atleast_1d_or_none(num_operations)
+    categ_operations = atleast_1d_or_none(categ_operations)
+    suffix = "" if suffix is None else suffix
+
+    num_cols, categ_cols = split_num_categ_cols(table.select(cols_to_agg))
+
+    num_aggfuncs, num_mode_cols = get_aggfuncs(num_cols, num_operations)
+    categ_aggfuncs, categ_mode_cols = get_aggfuncs(categ_cols, categ_operations)
+
+    aggfuncs = [*num_aggfuncs, *categ_aggfuncs]
+    # If aggfuncs is empty, the output will only contain the key columns.
+    table = table.group_by(key).agg(aggfuncs)
+
+    # Flatten the list output of the mode() aggregations by taking the
+    # first value.
+    flatten_ops = []
+    for col in [*num_mode_cols, *categ_mode_cols]:
+        flatten_ops.append(pl.col(col).list[0].alias(col))
+    # Add the flattened columns; this is a no-op if 'flatten_ops' is empty.
+    table = table.with_columns(flatten_ops)
+
+    cols_renaming = {col: f"{col}{suffix}" for col in table.columns if col not in key}
+    table = table.rename(cols_renaming)
+    sorted_cols = sorted(table.columns)
+
+    return table.select(sorted_cols)
+
+
+def join(left, right, left_on, right_on):
+    """Left join two :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`.
+
+    This function uses the ``dataframe.join`` method from Polars.
+
+    Note that the input dataframe types must agree: either both
+    Polars dataframes or both Polars lazyframes.
+
+    Mixing a Polars dataframe with a lazyframe raises an error.
+
+    Parameters
+    ----------
+    left : pl.DataFrame or pl.LazyFrame,
+        The left dataframe of the left-join.
+
+    right : pl.DataFrame or pl.LazyFrame,
+        The right dataframe of the left-join.
+
+    left_on : str or Iterable[str],
+        Left keys to merge on.
+
+    right_on : str or Iterable[str],
+        Right keys to merge on.
+
+    Returns
+    -------
+    merged : pl.DataFrame or pl.LazyFrame,
+        The merged output.
+    """
+    is_dataframe = isinstance(left, pl.DataFrame) and isinstance(right, pl.DataFrame)
+    is_lazyframe = isinstance(left, pl.LazyFrame) and isinstance(right, pl.LazyFrame)
+    if is_dataframe or is_lazyframe:
+        return left.join(
+            right,
+            how="left",
+            left_on=left_on,
+            right_on=right_on,
+        )
+    else:
+        raise TypeError(
+            "'left' and 'right' must be polars dataframes or lazyframes, "
+            f"got {type(left)!r} and {type(right)!r}."
+        )
+
+
+def get_aggfuncs(cols, operations):
+    """List Polars aggregation functions.
+
+    The list is used as input for the ``dataframe.group_by().agg()`` method
+    from Polars.
+    The 'mode' operation needs a flattening post-processing step.
+
+    Parameters
+    ----------
+    cols : list,
+        The columns to aggregate.
+
+    operations : list,
+        The reduce operations to perform.
+
+    Returns
+    -------
+    aggfuncs : list,
+        Named aggregation list.
+
+    mode_cols : list,
+        Output keys to post-process after 'mode' aggregation.
+    """
+    aggfuncs, mode_cols = [], []
+    for col, operation in product(cols, operations):
+        output_key = f"{col}_{operation}"
+        aggfunc = _polars_ops_mapping(col, operation, output_key)
+        aggfuncs.append(aggfunc)
+
+        if operation == "mode":
+            mode_cols.append(output_key)
+
+    return aggfuncs, mode_cols
+
+
+def _polars_ops_mapping(col, operation, output_key):
+    """Map an operation to its Polars expression.
+
+    Parameters
+    ----------
+    col : str,
+        Name of the column to aggregate.
+    operation : str,
+        Name of the reduce function.
+    output_key : str,
+        Name of the reduced column.
+
+    Returns
+    -------
+    aggfunc : polars.Expr,
+        The expression to apply.
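+
+    Examples
+    --------
+    Requesting an operation outside the mapping below raises a
+    ``ValueError`` listing the supported ones:
+
+    >>> _polars_ops_mapping("rating", "median", "rating_median")
+    Traceback (most recent call last):
+        ...
+    ValueError: Polars operation 'median' is not supported. Available: ['mean', 'std', 'sum', 'min', 'max', 'mode']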
+ """ + polars_aggfuncs = { + "mean": pl.col(col).mean(), + "std": pl.col(col).std(), + "sum": pl.col(col).sum(), + "min": pl.col(col).min(), + "max": pl.col(col).max(), + "mode": pl.col(col).mode(), + } + aggfunc = polars_aggfuncs.get(operation, None) + + if aggfunc is None: + raise ValueError( + f"Polars operation {operation!r} is not supported. Available:" + f" {list(polars_aggfuncs)}" + ) + + return aggfunc.alias(output_key) + + +def split_num_categ_cols(table): + """Split a dataframe columns between numerical and categorical.""" + num_cols = table.select(cs.numeric()).columns + categ_cols = table.select(cs.string()).columns + + return num_cols, categ_cols diff --git a/skrub/_dataframe/tests/__init__.py b/skrub/_dataframe/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/skrub/_dataframe/tests/test_namespace.py b/skrub/_dataframe/tests/test_namespace.py new file mode 100644 index 000000000..b6d944f52 --- /dev/null +++ b/skrub/_dataframe/tests/test_namespace.py @@ -0,0 +1,42 @@ +import pandas as pd +import pytest + +import skrub._dataframe._pandas as skrub_pd +import skrub._dataframe._polars as skrub_pl +from skrub._dataframe._namespace import get_df_namespace +from skrub._dataframe._polars import POLARS_SETUP + +main = pd.DataFrame( + { + "userId": [1, 1, 1, 2, 2, 2], + "movieId": [1, 3, 6, 318, 6, 1704], + "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], + "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], + } +) + + +def test_get_namespace_pandas(): + skrub_px, px = get_df_namespace(main, main) + assert skrub_px is skrub_pd + assert px is pd + + with pytest.raises(TypeError, match=r"(?=.*Only Pandas or Polars)(?=.*supported)"): + get_df_namespace(main, main.values) + + +@pytest.mark.skipif(not POLARS_SETUP, reason="Polars is not available") +def test_get_namespace_polars(): + import polars as pl + + skrub_px, px = get_df_namespace(pl.DataFrame(main), pl.DataFrame(main)) + assert skrub_px is skrub_pl + assert px is pl + + with pytest.raises(TypeError, match=r"(?=.*Mixing Pandas)(?=.*Polars)"): + get_df_namespace(main, pl.DataFrame(main)) + + with pytest.raises( + TypeError, match=r"(?=.*Mixing)(?=.*lazyframes)(?=.*dataframes)" + ): + get_df_namespace(pl.DataFrame(main), pl.LazyFrame(main)) diff --git a/skrub/_dataframe/tests/test_pandas.py b/skrub/_dataframe/tests/test_pandas.py new file mode 100644 index 000000000..7a245a365 --- /dev/null +++ b/skrub/_dataframe/tests/test_pandas.py @@ -0,0 +1,109 @@ +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal, assert_series_equal + +from skrub._dataframe._pandas import aggregate, join, make_dataframe, make_series + +main = pd.DataFrame( + { + "userId": [1, 1, 1, 2, 2, 2], + "movieId": [1, 3, 6, 318, 6, 1704], + "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], + "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], + } +) + + +def test_join(): + joined = join(left=main, right=main, left_on="movieId", right_on="movieId") + expected = main.merge(main, on="movieId", how="left") + assert_frame_equal(joined, expected) + + +def test_simple_agg(): + aggregated = aggregate( + table=main, + key="movieId", + cols_to_agg=["rating", "genre"], + num_operations="mean", + categ_operations="mode", + ) + aggfunc = { + "genre_mode": ("genre", pd.Series.mode), + "rating_mean": ("rating", "mean"), + } + expected = main.groupby("movieId").agg(**aggfunc) + assert_frame_equal(aggregated, expected) + + +def test_value_counts_agg(): + aggregated = aggregate( + table=main, + key="userId", + 
cols_to_agg="rating", + num_operations="value_counts", + categ_operations=None, + suffix="_user", + ) + expected = pd.DataFrame( + { + "rating_2.0_user": [0.0, 1.0], + "rating_3.0_user": [0.0, 1.0], + "rating_4.0_user": [3.0, 1.0], + "userId": [1, 2], + } + ) + assert_frame_equal(aggregated, expected) + + aggregated = aggregate( + table=main, + key="userId", + cols_to_agg="rating", + num_operations="hist(2)", + categ_operations=None, + suffix="_user", + ) + expected = pd.DataFrame( + { + "rating_(1.999, 3.0]_user": [0, 2], + "rating_(3.0, 4.0]_user": [3, 1], + "userId": [1, 2], + } + ) + assert_frame_equal(aggregated, expected) + + +def test_incorrect_dataframe_inputs(): + with pytest.raises(TypeError, match=r"(?=.*pandas dataframes)(?=.*array)"): + join(left=main.values, right=main, left_on="movieId", right_on="movieId") + + with pytest.raises(TypeError, match=r"(?=.*pandas dataframe)(?=.*array)"): + aggregate( + table=main.values, + key="movieId", + cols_to_agg="rating", + num_operations="mean", + ) + + +def test_no_agg_operation(): + with pytest.raises(ValueError, match=r"(?=.*No aggregation)"): + aggregate( + table=main, + key="movieId", + cols_to_agg="rating", + num_operations=None, + categ_operations=None, + ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pd.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X, index=[0, 1]), expected_df) + + +def test_make_series(): + X = [1, 2, 3] + expected_series = pd.Series(X) + assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) diff --git a/skrub/_dataframe/tests/test_polars.py b/skrub/_dataframe/tests/test_polars.py new file mode 100644 index 000000000..2e55952a7 --- /dev/null +++ b/skrub/_dataframe/tests/test_polars.py @@ -0,0 +1,93 @@ +import pandas as pd +import pytest + +from skrub._dataframe._polars import ( + POLARS_SETUP, + aggregate, + join, + make_dataframe, + make_series, +) + +if POLARS_SETUP: + import polars as pl + from polars.testing import assert_frame_equal, assert_series_equal + + main = pl.DataFrame( + { + "userId": [1, 1, 1, 2, 2, 2], + "movieId": [1, 3, 6, 318, 6, 1704], + "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], + "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], + } + ) +else: + polars_missing_msg = "Polars is not available" + pytest.skip(reason=polars_missing_msg, allow_module_level=True) + + +def test_join(): + joined = join(left=main, right=main, left_on="movieId", right_on="movieId") + expected = main.join(main, on="movieId", how="left") + assert_frame_equal(joined, expected) + + +def test_simple_agg(): + aggregated = aggregate( + table=main, + key="movieId", + cols_to_agg="rating", + num_operations="mean", + ) + aggfunc = pl.col("rating").mean().alias("rating_mean") + expected = main.group_by("movieId").agg(aggfunc) + # As group_by parallizes threads, the row order of its output isn't + # deterministic. Hence, we need to set check_row_order to False. 
+ assert_frame_equal(aggregated, expected, check_row_order=False) + + +def test_mode_agg(): + aggregated = aggregate( + table=main, + key="movieId", + cols_to_agg="genre", + categ_operations=["mode"], + ) + expected = pl.DataFrame( + { + "genre_mode": ["drama", "drama", "sf", "sf", "comedy"], + "movieId": [3, 1, 318, 1704, 6], + } + ) + assert_frame_equal(aggregated, expected, check_row_order=False) + + +def test_incorrect_dataframe_inputs(): + with pytest.raises(TypeError, match=r"(?=.*polars dataframes)(?=.*pandas)"): + join(left=pd.DataFrame(main), right=main, left_on="movieId", right_on="movieId") + + with pytest.raises(TypeError, match=r"(?=.*polars dataframe)(?=.*pandas)"): + aggregate( + table=pd.DataFrame(main), + key="movieId", + cols_to_agg="rating", + num_operations="mean", + ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X), expected_df) + + with pytest.raises(ValueError, match=r"(?=.*Polars dataframe)(?=.*index)"): + make_dataframe(X, index=[0, 1]) + + +def test_make_series(): + X = [1, 2, 3] + expected_series = pl.Series(X) + assert_series_equal(make_series(X, index=None), expected_series) + + with pytest.raises(ValueError, match=r"(?=.*Polars series)(?=.*index)"): + make_series(X, index=[0, 1]) From c00ef25f0a066aa8d4747fa532f54e8206edc4dd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 11:24:58 +0100 Subject: [PATCH 6/7] fix select cols --- skrub/_select_cols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index 183562d27..8daade477 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -1,6 +1,6 @@ from sklearn.base import BaseEstimator, TransformerMixin -from .dataframe import get_df_namespace +from ._dataframe._namespace import get_df_namespace def _check_columns(df, columns): From 94bc74480b622b2f80545b603a416a0892928567 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 15:14:16 +0100 Subject: [PATCH 7/7] fix tests --- skrub/tests/test_select_cols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/tests/test_select_cols.py b/skrub/tests/test_select_cols.py index 0e36866be..3ab07e590 100644 --- a/skrub/tests/test_select_cols.py +++ b/skrub/tests/test_select_cols.py @@ -3,7 +3,7 @@ import pytest from skrub import DropCols, SelectCols -from skrub.dataframe import POLARS_SETUP +from skrub._dataframe._polars import POLARS_SETUP DATAFRAME_MODULES = [pandas] if POLARS_SETUP: