From a06885362ce8d9b89441f4565a32d95ea6eaf100 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 17:53:59 +0200 Subject: [PATCH 1/7] add make_series and make_dataframe --- skrub/dataframe/_pandas.py | 31 +++++++++++++++++ skrub/dataframe/_polars.py | 50 ++++++++++++++++++++++++++++ skrub/dataframe/tests/test_pandas.py | 20 +++++++++-- skrub/dataframe/tests/test_polars.py | 21 ++++++++++-- 4 files changed, 118 insertions(+), 4 deletions(-) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py index f6aae26cd..69f9c306d 100644 --- a/skrub/dataframe/_pandas.py +++ b/skrub/dataframe/_pandas.py @@ -12,6 +12,37 @@ from skrub._utils import atleast_1d_or_none +def make_dataframe(X, index): + if not isinstance(X, dict) or not all( + (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) + for X_col in X.values() + ): + raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") + return pd.DataFrame(X, index=index) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Pandas series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the series. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Pandas series + Converted output. + """ + return pd.Series(X, index=index, name=name) + + def aggregate( table: pd.DataFrame, key: str | Iterable[str], diff --git a/skrub/dataframe/_polars.py b/skrub/dataframe/_polars.py index a10546b7e..427aca15c 100644 --- a/skrub/dataframe/_polars.py +++ b/skrub/dataframe/_polars.py @@ -3,6 +3,8 @@ """ from typing import Iterable +import numpy as np + from skrub.dataframe._types import POLARS_SETUP, DataFrameLike if POLARS_SETUP: @@ -14,6 +16,54 @@ from skrub._utils import atleast_1d_or_none +def make_dataframe(X, index=None): + """Convert an dictionary of columns into a Polars dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + Returns + ------- + X : Polars dataframe + Converted output. + """ + if not isinstance(X, dict) or not all( + (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) + for X_col in X.values() + ): + raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") + return pl.DataFrame(X) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Polars series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Polars series + Converted output. 
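+
+    Examples
+    --------
+    A minimal usage sketch; the repr shown assumes a recent Polars version
+    and may vary slightly across releases:
+
+    >>> make_series([1, 2, 3], name="a")  # doctest: +SKIP
+    shape: (3,)
+    Series: 'a' [i64]
+    [
+        1
+        2
+        3
+    ]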
+ """ + return pl.Series(values=X, name=name) + + def aggregate( table: DataFrameLike, key: str | Iterable[str], diff --git a/skrub/dataframe/tests/test_pandas.py b/skrub/dataframe/tests/test_pandas.py index 067788540..1453dd230 100644 --- a/skrub/dataframe/tests/test_pandas.py +++ b/skrub/dataframe/tests/test_pandas.py @@ -1,8 +1,8 @@ import pandas as pd import pytest -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal -from skrub.dataframe._pandas import aggregate, join +from skrub.dataframe._pandas import aggregate, join, make_dataframe, make_series main = pd.DataFrame( { @@ -95,3 +95,19 @@ def test_no_agg_operation(): num_operations=None, categ_operations=None, ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pd.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X, index=[0, 1]), expected_df) + + X = [[1, 2], ["z", "e"]] + with pytest.raises(TypeError): + make_dataframe(X) + + +def test_make_series(): + X = [1, 2, 3] + expected_series = pd.Series(X) + assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) diff --git a/skrub/dataframe/tests/test_polars.py b/skrub/dataframe/tests/test_polars.py index 49ccee1d0..f982f72f0 100644 --- a/skrub/dataframe/tests/test_polars.py +++ b/skrub/dataframe/tests/test_polars.py @@ -2,11 +2,11 @@ import pytest from skrub.dataframe import POLARS_SETUP -from skrub.dataframe._polars import aggregate, join +from skrub.dataframe._polars import aggregate, join, make_dataframe, make_series if POLARS_SETUP: import polars as pl - from polars.testing import assert_frame_equal + from polars.testing import assert_frame_equal, assert_series_equal main = pl.DataFrame( { @@ -71,3 +71,20 @@ def test_incorrect_dataframe_inputs(): cols_to_agg="rating", num_operations="mean", ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X, index=[1, 2]), expected_df) + + X = [[1, 2], ["z", "e"]] + with pytest.raises(TypeError): + make_dataframe(X) + + +@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) +def test_make_series(): + X = [1, 2, 3] + expected_series = pl.Series(X) + assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) From 7e1424da39d79fda91ee2dea30249fc379b27e92 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 16 Oct 2023 17:59:51 +0200 Subject: [PATCH 2/7] forgot make_dataframe for pandas docstring --- skrub/dataframe/_pandas.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py index 69f9c306d..d549e8104 100644 --- a/skrub/dataframe/_pandas.py +++ b/skrub/dataframe/_pandas.py @@ -13,6 +13,21 @@ def make_dataframe(X, index): + """Convert an dictionary of columns into a Pandas dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the dataframe. + + Returns + ------- + X : Pandas dataframe + Converted output. 
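+
+    Examples
+    --------
+    A small sketch reusing the toy input from the tests:
+
+    >>> make_dataframe({"a": [1, 2], "b": ["z", "e"]}, index=[0, 1])
+       a  b
+    0  1  z
+    1  2  e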
+ """ if not isinstance(X, dict) or not all( (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) for X_col in X.values() From e5e00d0b9ee67752ef489777f243c1ffe8ee0741 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 16:43:41 +0100 Subject: [PATCH 3/7] apply suggestions --- skrub/dataframe/_pandas.py | 2 +- skrub/dataframe/_polars.py | 10 ++++++++++ skrub/dataframe/tests/test_polars.py | 20 +++++++++++--------- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py index d549e8104..b3be557ec 100644 --- a/skrub/dataframe/_pandas.py +++ b/skrub/dataframe/_pandas.py @@ -12,7 +12,7 @@ from skrub._utils import atleast_1d_or_none -def make_dataframe(X, index): +def make_dataframe(X, index=None): """Convert an dictionary of columns into a Pandas dataframe. Parameters diff --git a/skrub/dataframe/_polars.py b/skrub/dataframe/_polars.py index 427aca15c..bf4e30f7f 100644 --- a/skrub/dataframe/_polars.py +++ b/skrub/dataframe/_polars.py @@ -33,6 +33,11 @@ def make_dataframe(X, index=None): X : Polars dataframe Converted output. """ + if index is not None: + raise ValueError( + "Polars dataframes don't have an index, but " + f"the Polars dataframe maker was called with {index=!r}." + ) if not isinstance(X, dict) or not all( (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) for X_col in X.values() @@ -61,6 +66,11 @@ def make_series(X, index=None, name=None): X : Polars series Converted output. """ + if index is not None: + raise ValueError( + "Polars series don't have an index, but " + f"the Polars series maker was called with {index=!r}." + ) return pl.Series(values=X, name=name) diff --git a/skrub/dataframe/tests/test_polars.py b/skrub/dataframe/tests/test_polars.py index f982f72f0..c9f8bc4bd 100644 --- a/skrub/dataframe/tests/test_polars.py +++ b/skrub/dataframe/tests/test_polars.py @@ -16,18 +16,17 @@ "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], } ) +else: + polars_missing_msg = "Polars is not available" + pytest.skip(reason=polars_missing_msg, allow_module_level=True) -POLARS_MISSING_MSG = "Polars is not available" - -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_join(): joined = join(left=main, right=main, left_on="movieId", right_on="movieId") expected = main.join(main, on="movieId", how="left") assert_frame_equal(joined, expected) -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_simple_agg(): aggregated = aggregate( table=main, @@ -42,7 +41,6 @@ def test_simple_agg(): assert_frame_equal(aggregated, expected, check_row_order=False) -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_mode_agg(): aggregated = aggregate( table=main, @@ -59,7 +57,6 @@ def test_mode_agg(): assert_frame_equal(aggregated, expected, check_row_order=False) -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_incorrect_dataframe_inputs(): with pytest.raises(TypeError, match=r"(?=.*polars dataframes)(?=.*pandas)"): join(left=pd.DataFrame(main), right=main, left_on="movieId", right_on="movieId") @@ -76,15 +73,20 @@ def test_incorrect_dataframe_inputs(): def test_make_dataframe(): X = dict(a=[1, 2], b=["z", "e"]) expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) - assert_frame_equal(make_dataframe(X, index=[1, 2]), expected_df) + assert_frame_equal(make_dataframe(X), expected_df) X = [[1, 2], ["z", "e"]] with pytest.raises(TypeError): make_dataframe(X) + with pytest.raises(ValueError, match=r"(?=.*Polars 
dataframe)(?=.*index)"): + make_dataframe(X, index=[0, 1]) + -@pytest.mark.skipif(not POLARS_SETUP, reason=POLARS_MISSING_MSG) def test_make_series(): X = [1, 2, 3] expected_series = pl.Series(X) - assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) + assert_series_equal(make_series(X, index=None), expected_series) + + with pytest.raises(ValueError, match=r"(?=.*Polars series)(?=.*index)"): + make_series(X, index=[0, 1]) From 386ba6a7b807a05ca32b68f6c480ee28a8fb011d Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 18:00:40 +0100 Subject: [PATCH 4/7] from dataframe to _dataframe --- doc/api.rst | 40 --- skrub/_agg_joiner.py | 56 ++-- skrub/dataframe/__init__.py | 19 -- skrub/dataframe/_namespace.py | 103 ------- skrub/dataframe/_pandas.py | 339 ------------------------ skrub/dataframe/_polars.py | 278 ------------------- skrub/dataframe/_types.py | 14 - skrub/dataframe/tests/__init__.py | 0 skrub/dataframe/tests/test_namespace.py | 41 --- skrub/dataframe/tests/test_pandas.py | 113 -------- skrub/dataframe/tests/test_polars.py | 92 ------- skrub/tests/test_agg_joiner.py | 2 +- 12 files changed, 20 insertions(+), 1077 deletions(-) delete mode 100644 skrub/dataframe/__init__.py delete mode 100644 skrub/dataframe/_namespace.py delete mode 100644 skrub/dataframe/_pandas.py delete mode 100644 skrub/dataframe/_polars.py delete mode 100644 skrub/dataframe/_types.py delete mode 100644 skrub/dataframe/tests/__init__.py delete mode 100644 skrub/dataframe/tests/test_namespace.py delete mode 100644 skrub/dataframe/tests/test_pandas.py delete mode 100644 skrub/dataframe/tests/test_polars.py diff --git a/doc/api.rst b/doc/api.rst index 3604abc9b..df1a0a393 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -87,46 +87,6 @@ This page lists all available functions and classes of `skrub`. deduplicate -.. raw:: html - -
-   <h2>Dataframes operations</h2>
- -.. autosummary:: - :toctree: generated/ - :template: function.rst - :nosignatures: - :caption: DataFrames operations - - dataframe.get_df_namespace - -.. raw:: html - -
-   <h3>Pandas</h3>
- -.. autosummary:: - :toctree: generated/ - :template: function.rst - :nosignatures: - :caption: Pandas operations - - dataframe.is_pandas - dataframe.pd_aggregate - dataframe.pd_join - -.. raw:: html - -
-   <h3>Polars</h3>
- -.. autosummary:: - :toctree: generated/ - :template: function.rst - :nosignatures: - :caption: Polars operations - - dataframe.is_polars - dataframe.pl_aggregate - dataframe.pl_join - .. raw:: html
    <h2>Data download and generation</h2>
diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py index 7e3e3fb2b..effc833dc 100644 --- a/skrub/_agg_joiner.py +++ b/skrub/_agg_joiner.py @@ -9,15 +9,13 @@ from typing import Iterable import numpy as np -from numpy.typing import ArrayLike from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import check_is_fitted +from skrub._dataframe._namespace import get_df_namespace +from skrub._dataframe._pandas import _parse_argument from skrub._utils import atleast_1d_or_none, atleast_2d_or_none -from skrub.dataframe import DataFrameLike, SeriesLike -from skrub.dataframe._namespace import get_df_namespace -from skrub.dataframe._pandas import _parse_argument NUM_OPERATIONS = ["sum", "mean", "std", "min", "max", "hist", "value_counts"] CATEG_OPERATIONS = ["mode", "count", "value_counts"] @@ -52,10 +50,10 @@ def split_num_categ_operations(operations: list[str]) -> tuple[list[str], list[s def check_missing_columns( - X: DataFrameLike, - columns: list[str], - error_msg: str, -) -> None: + X, + columns, + error_msg, +): """All elements of main_key must belong to the columns of X. Parameters @@ -161,13 +159,13 @@ class AggJoiner(BaseEstimator, TransformerMixin): def __init__( self, - aux_table: DataFrameLike | Iterable[DataFrameLike] | str | Iterable[str], + aux_table, *, - aux_key: str | Iterable[str], - main_key: str | Iterable[str], - cols: str | Iterable[str] | None = None, - operation: str | Iterable[str] | None = None, - suffix: str | Iterable[str] | None = None, + aux_key, + main_key, + cols=None, + operation=None, + suffix=None, ): self.aux_table = aux_table self.aux_key = aux_key @@ -176,11 +174,7 @@ def __init__( self.operation = operation self.suffix = suffix - def fit( - self, - X: DataFrameLike, - y: ArrayLike | SeriesLike | None = None, - ) -> "AggJoiner": + def fit(self, X, y=None): """Aggregate auxiliary tables based on the main keys. Parameters @@ -221,7 +215,7 @@ def fit( return self - def transform(self, X: DataFrameLike) -> DataFrameLike: + def transform(self, X): """Left-join pre-aggregated tables on `X`. Parameters @@ -248,18 +242,14 @@ def transform(self, X: DataFrameLike) -> DataFrameLike: return X - def _screen( - self, - aux_table: DataFrameLike, - y: DataFrameLike | SeriesLike | ArrayLike, - ) -> DataFrameLike: + def _screen(self, aux_table, y): """Only keep aggregated features which correlation with y is above some threshold. """ # TODO: Add logic return aux_table - def check_input(self, X: DataFrameLike) -> None: + def check_input(self, X): """Perform a check on column names data type and suffixes. Parameters @@ -452,11 +442,7 @@ def __init__( self.operation = operation self.suffix = suffix - def fit( - self, - X: DataFrameLike, - y: DataFrameLike | SeriesLike | ArrayLike, - ) -> "AggTarget": + def fit(self, X, y): """Aggregate the target ``y`` based on keys from ``X``. Parameters @@ -501,7 +487,7 @@ def fit( return self - def transform(self, X: DataFrameLike) -> DataFrameLike: + def transform(self, X): """Left-join pre-aggregated tables on `X`. Parameters @@ -524,11 +510,7 @@ def transform(self, X: DataFrameLike) -> DataFrameLike: right_on=self.main_key_, ) - def check_input( - self, - X: DataFrameLike, - y: DataFrameLike | SeriesLike | ArrayLike, - ) -> DataFrameLike: + def check_input(self, X, y): """Perform a check on column names data type and suffixes. 
Parameters diff --git a/skrub/dataframe/__init__.py b/skrub/dataframe/__init__.py deleted file mode 100644 index a89e910e6..000000000 --- a/skrub/dataframe/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from skrub.dataframe._namespace import get_df_namespace, is_pandas, is_polars -from skrub.dataframe._pandas import aggregate as pd_aggregate -from skrub.dataframe._pandas import join as pd_join -from skrub.dataframe._polars import aggregate as pl_aggregate -from skrub.dataframe._polars import join as pl_join -from skrub.dataframe._types import POLARS_SETUP, DataFrameLike, SeriesLike - -__all__ = [ - POLARS_SETUP, - DataFrameLike, - SeriesLike, - get_df_namespace, - is_pandas, - is_polars, - pd_join, - pd_aggregate, - pl_join, - pl_aggregate, -] diff --git a/skrub/dataframe/_namespace.py b/skrub/dataframe/_namespace.py deleted file mode 100644 index ebd6516b1..000000000 --- a/skrub/dataframe/_namespace.py +++ /dev/null @@ -1,103 +0,0 @@ -import sys -from types import ModuleType - -import pandas as pd - -import skrub.dataframe._pandas as skrub_pd -import skrub.dataframe._polars as skrub_pl -from skrub.dataframe._types import DataFrameLike - - -def is_pandas(dataframe: DataFrameLike) -> bool: - """Check whether the input is a Pandas dataframe. - - Parameters - ---------- - dataframe : DataFrameLike - The input dataframe - - Returns - ------- - is_pandas : bool - Whether the dataframe is a Pandas dataframe or not. - """ - return isinstance(dataframe, pd.DataFrame) - - -def is_polars(dataframe: DataFrameLike) -> bool: - """Check whether the input is a Polars dataframe or lazyframe. - - Parameters - ---------- - dataframe : DataFrameLike - The input dataframe - - Returns - ------- - is_polars : bool - Whether the dataframe is a Polars dataframe/lazyframe or not. - """ - if "polars" not in sys.modules: - return False - - import polars as pl - - return isinstance(dataframe, (pl.DataFrame, pl.LazyFrame)) - - -def get_df_namespace( - *dfs: DataFrameLike | list[DataFrameLike], -) -> tuple[ModuleType, ModuleType]: - """Get the namespaces of dataframes. - - Introspects dataframes and returns their skrub namespace object - ``skrub.dataframe._{pandas, polars}`` and the dataframe module - ``{polars, pandas}`` itself. - - The dataframes passed in input need to come from the same module, otherwise a - ``TypeError`` will be raised. - - The outputs of this function are denoted ``skrub_px`` and ``px`` in reference to - the array API, returning namespace (NumPy, PyTorch and CuPy) as ``nx``. - Since we deal with Polars (``pl``) and Pandas (``pd``), we use ``px`` - as a variable name. - - Parameters - ---------- - dfs : DataFrameLike | list[DataFrameLike], - The dataframes to extract modules from. - - Returns - ------- - skrub_px : ModuleType - Skrub namespace shared by dataframe objects. - - px : ModuleType - Dataframe namespace, i.e. Pandas or Polars module. - """ - # FIXME Pandas and Polars series will raise errors. - if all([is_pandas(df) for df in dfs]): - return skrub_pd, pd - - elif all([is_polars(df) for df in dfs]): - import polars as pl - - if all([isinstance(df, pl.DataFrame) for df in dfs]) or all( - [isinstance(df, pl.LazyFrame) for df in dfs] - ): - return skrub_pl, pl - else: - raise TypeError("Mixing Polars lazyframes and dataframes is not supported.") - - else: - modules = [type(df).__module__ for df in dfs] - if all([is_polars(df) or is_pandas(df) for df in dfs]): - raise TypeError( - "Mixing Pandas and Polars dataframes is not supported, " - f"got {modules=!r}." 
- ) - else: - raise TypeError( - "Only Pandas or Polars dataframes are currently supported, " - f"got {modules=!r}." - ) diff --git a/skrub/dataframe/_pandas.py b/skrub/dataframe/_pandas.py deleted file mode 100644 index b3be557ec..000000000 --- a/skrub/dataframe/_pandas.py +++ /dev/null @@ -1,339 +0,0 @@ -""" -Pandas specialization of the aggregate and join operation. -""" -import re -from collections.abc import Callable -from itertools import product -from typing import Iterable - -import numpy as np -import pandas as pd - -from skrub._utils import atleast_1d_or_none - - -def make_dataframe(X, index=None): - """Convert an dictionary of columns into a Pandas dataframe. - - Parameters - ---------- - X : mapping from column name to 1d iterable - Input data to convert. - - index : 1d array-like, default=None - The index of the dataframe. - - Returns - ------- - X : Pandas dataframe - Converted output. - """ - if not isinstance(X, dict) or not all( - (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) - for X_col in X.values() - ): - raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") - return pd.DataFrame(X, index=index) - - -def make_series(X, index=None, name=None): - """Convert an 1d array into a Pandas series. - - Parameters - ---------- - X : 1d iterable - Input data to convert. - - index : 1d array-like, default=None - The index of the series. - - name : str, default=None - The name of the series. - - Returns - ------- - X : Pandas series - Converted output. - """ - return pd.Series(X, index=index, name=name) - - -def aggregate( - table: pd.DataFrame, - key: str | Iterable[str], - cols_to_agg: str | Iterable[str], - num_operations: str | Iterable[str] = ("mean",), - categ_operations: str | Iterable[str] = ("mode",), - suffix: str | None = None, -) -> pd.DataFrame: - """Aggregates a :obj:`pandas.DataFrame`. - - This function uses the ``dataframe.groupby(key).agg`` method from Pandas. - - Parameters - ---------- - table : pd.DataFrame, - The input dataframe to aggregate. - - key : str or Iterable[str], - The columns used as keys to aggregate on. - - cols_to_agg : str or Iterable[str], - The columns to aggregate. - - num_operations : str or Iterable[str], - The reduction functions to apply on numerical columns - in ``cols_to_agg`` during the aggregation. - - categ_operations : str or Iterable[str], - The reduction functions to apply on categorical columns - in ``cols_to_agg`` during the aggregation. - - suffix : str, optional - The suffix appended to output columns. - - Returns - ------- - group : pd.DataFrame, - The aggregated output. 
- """ - if not isinstance(table, pd.DataFrame): - raise TypeError(f"'table' must be a pandas dataframe, got {type(table)!r}.") - - key = atleast_1d_or_none(key) - cols_to_agg = atleast_1d_or_none(cols_to_agg) - num_operations = atleast_1d_or_none(num_operations) - categ_operations = atleast_1d_or_none(categ_operations) - suffix = "" if suffix is None else suffix - - num_cols, categ_cols = split_num_categ_cols(table[cols_to_agg]) - - num_named_agg, num_value_counts = get_named_agg(table, num_cols, num_operations) - categ_named_agg, categ_value_counts = get_named_agg( - table, categ_cols, categ_operations - ) - - named_agg = {**num_named_agg, **categ_named_agg} - if named_agg: - base_group = table.groupby(key).agg(**named_agg) - else: - base_group = None - - # 'histogram' and 'value_counts' requires a pivot - value_counts = {**num_value_counts, **categ_value_counts} - for output_key, (col_to_agg, kwargs) in value_counts.items(): - serie_group = table.groupby(key)[col_to_agg].value_counts(**kwargs) - serie_group.name = output_key - pivot = ( - serie_group.reset_index() - .pivot(index=key, columns=col_to_agg) - .reset_index() - .fillna(0) - ) - cols = pivot.columns.droplevel(0) - index_cols = np.atleast_1d(key).tolist() - feature_cols = (f"{col_to_agg}_" + cols[len(index_cols) :].astype(str)).tolist() - cols = [*index_cols, *feature_cols] - pivot.columns = cols - - if base_group is None: - base_group = pivot - else: - base_group = base_group.merge(pivot, on=key, how="left") - - if base_group is None: - raise ValueError("No aggregation to perform.") - - base_group.columns = [ - f"{col}{suffix}" if col not in key else col for col in base_group.columns - ] - sorted_cols = sorted(base_group.columns) - - return base_group[sorted_cols] - - -def join( - left: pd.DataFrame, - right: pd.DataFrame, - left_on: str | Iterable[str], - right_on: str | Iterable[str], -) -> pd.DataFrame: - """Left join two :obj:`pandas.DataFrame`. - - This function uses the ``dataframe.merge`` method from Pandas. - - Parameters - ---------- - left : pd.DataFrame, - The left dataframe to left-join. - - right : pd.DataFrame, - The right dataframe to left-join. - - left_on : str or Iterable[str] - Left keys to merge on. - - right_on : str or Iterable[str] - Right keys to merge on. - - Returns - ------- - merged : pd.DataFrame, - The merged output. - """ - if not (isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame)): - raise TypeError( - "'left' and 'right' must be pandas dataframes, " - f"got {type(left)!r} and {type(right)!r}." - ) - return left.merge( - right, - how="left", - left_on=left_on, - right_on=right_on, - ) - - -def get_named_agg( - table: pd.DataFrame, cols: list[str], operations: list[str] -) -> tuple[dict, dict]: - """Map aggregation tuples to their output key. - - The dictionary has the form: output_key = (column, aggfunc). - This is used as input for the ``dataframe.agg`` method from Pandas. - - 'value_counts' and 'hist' operation require to pivot - the tables and treated in a separate mapping. - - Parameters - ---------- - table : pd.DataFrame, - Input dataframe, only used to compute bins values if - 'value_counts' or 'hist' are operations. - - cols : list, - The columns to aggregate. - - operations : list, - The reduce operations to perform. - - Returns - ------- - named_agg : dict, - Named aggregation mapping. - - value_counts : dict, - ``value_counts`` operations mapping. 
- """ - named_agg, value_counts = {}, {} - for col, operation in product(cols, operations): - op_root, bin_args = _parse_argument(operation) - aggfunc, bin_args = _get_aggfunc(table[col], op_root, bin_args) - - output_key = f"{col}_{op_root}" - # 'value_counts' change the index of the resulting frame - # and must be treated separately. - if aggfunc == "value_counts": - value_counts[output_key] = (col, bin_args) - else: - named_agg[output_key] = (col, aggfunc) - - return named_agg, value_counts - - -def _parse_argument(operation: str) -> tuple[str, int | None]: - """Split a text input into a function name and its argument. - - Parameters - ---------- - operation : str, - The operation to parse. - - Returns - ------- - operation_root : str, - The name of the operation before parenthesis, if any. - - bin_args : int, - The number of bin to create for ``hist`` or ``value_counts``. - - Examples - -------- - >>> _parse_argument("hist(10)") - ('hist', 10) - """ - split = re.split("\\(.+\\)", operation) - op_root = split[0] - if len(split) > 1: - # remove op_root - bin_args = re.split(f"^{op_root}", operation) - bin_args = bin_args[1] - # remove parenthesis - bin_args = re.sub("\\(|\\)", "", bin_args) - bin_args = int(bin_args) - return op_root, bin_args - else: - return op_root, None - - -PANDAS_OPS_MAPPING = { - "mode": pd.Series.mode, - "quantile": pd.Series.quantile, - "hist": "value_counts", -} - - -def _get_aggfunc( - serie: pd.Series, op_root: str, n_bins: int -) -> tuple[str | Callable, dict]: - """Map operation roots to their pandas agg functions. - - When args is provided for histogram or value_counts, - we create args - - Parameters - ---------- - serie : pd.Series, - Input series, used to compute the bins if n_bins is provided. - - op_root : str, - Operation root, the operation without the bin argument, if any. - - n_bins : int, - The number of bin to create when value_counts or hist operation are used. - - Returns - ------- - aggfunc : str or callable, - The pandas agg functions to perform - - bins_args : dict, - The bins to create when using value_counts or hist. - """ - aggfunc = PANDAS_OPS_MAPPING.get(op_root, op_root) - - if n_bins is not None: - # histogram and value_counts - if aggfunc == "value_counts": - # If bins is a number, we need to set a fix bin range, - # otherwise bins edges will be defined dynamically for - # each rows. - min_, max_ = serie.min(), serie.max() - bins = np.linspace(min_, max_, n_bins + 1) - bins_args = dict(bins=bins) - else: - raise ValueError( - f"Operator {op_root!r} doesn't take any argument, got {n_bins!r}" - ) - else: - bins_args = {} - - return aggfunc, bins_args - - -def split_num_categ_cols(table): - """Split dataframe columns between numerical and categorical.""" - num_cols = table.select_dtypes("number").columns - categ_cols = table.select_dtypes(["object", "string", "category"]).columns - - return num_cols, categ_cols diff --git a/skrub/dataframe/_polars.py b/skrub/dataframe/_polars.py deleted file mode 100644 index bf4e30f7f..000000000 --- a/skrub/dataframe/_polars.py +++ /dev/null @@ -1,278 +0,0 @@ -""" -Polars specialization of the aggregate and join operations. -""" -from typing import Iterable - -import numpy as np - -from skrub.dataframe._types import POLARS_SETUP, DataFrameLike - -if POLARS_SETUP: - import polars as pl - import polars.selectors as cs - -from itertools import product - -from skrub._utils import atleast_1d_or_none - - -def make_dataframe(X, index=None): - """Convert an dictionary of columns into a Polars dataframe. 
- - Parameters - ---------- - X : mapping from column name to 1d iterable - Input data to convert. - - index : 1d array-like, default=None - Unused since polars doesn't use index. - Only here for compatibility with Pandas. - - Returns - ------- - X : Polars dataframe - Converted output. - """ - if index is not None: - raise ValueError( - "Polars dataframes don't have an index, but " - f"the Polars dataframe maker was called with {index=!r}." - ) - if not isinstance(X, dict) or not all( - (isinstance(X_col, Iterable) and np.asarray(X_col).ndim == 1) - for X_col in X.values() - ): - raise TypeError(f"X must be a dictionary of 1d array. Got {X=!r}.") - return pl.DataFrame(X) - - -def make_series(X, index=None, name=None): - """Convert an 1d array into a Polars series. - - Parameters - ---------- - X : 1d iterable - Input data to convert. - - index : 1d array-like, default=None - Unused since polars doesn't use index. - Only here for compatibility with Pandas. - - name : str, default=None - The name of the series. - - Returns - ------- - X : Polars series - Converted output. - """ - if index is not None: - raise ValueError( - "Polars series don't have an index, but " - f"the Polars series maker was called with {index=!r}." - ) - return pl.Series(values=X, name=name) - - -def aggregate( - table: DataFrameLike, - key: str | Iterable[str], - cols_to_agg: str | Iterable[str], - num_operations: str | Iterable[str] = ("mean",), - categ_operations: str | Iterable[str] = ("mode",), - suffix: str | None = None, -) -> DataFrameLike: - """Aggregate a :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`. - - This function uses the ``dataframe.group_by(key).agg`` method from Polars. - - Parameters - ---------- - table : pl.DataFrame or pl.LazyFrame, - The input dataframe to aggregate. - - key : str or Iterable[str], - The columns used as keys to aggregate on. - - cols_to_agg : str or Iterable[str], - The columns to aggregate. - - num_operations : str or Iterable[str], - The reduction functions to apply on numerical columns - in ``cols_to_agg`` during the aggregation. - - categ_operations : str or Iterable[str], - The reduction functions to apply on categorical columns - in ``cols_to_agg`` during the aggregation. - - suffix : str, - The suffix appended to output columns. - - Returns - ------- - group : pl.DataFrame or pl.LazyFrame, - The aggregated output. - """ - if not isinstance(table, (pl.DataFrame, pl.LazyFrame)): - raise TypeError( - f"'table' must be a polars dataframe or lazyframe, got {type(table)!r}." - ) - - key = atleast_1d_or_none(key) - cols_to_agg = atleast_1d_or_none(cols_to_agg) - num_operations = atleast_1d_or_none(num_operations) - categ_operations = atleast_1d_or_none(categ_operations) - suffix = "" if suffix is None else suffix - - num_cols, categ_cols = split_num_categ_cols(table.select(cols_to_agg)) - - num_aggfuncs, num_mode_cols = get_aggfuncs(num_cols, num_operations) - categ_aggfuncs, categ_mode_cols = get_aggfuncs(categ_cols, categ_operations) - - aggfuncs = [*num_aggfuncs, *categ_aggfuncs] - # If aggfuncs is empty, the output will be a series of index. - table = table.group_by(key).agg(aggfuncs) - - # flattening post-processing of mode() cols - flatten_ops = [] - for col in [*num_mode_cols, *categ_mode_cols]: - flatten_ops.append(pl.col(col).list[0].alias(col)) - # add columns, no-op if 'flatten_ops' is empty. 
- table = table.with_columns(flatten_ops) - - cols_renaming = {col: f"{col}{suffix}" for col in table.columns if col not in key} - table = table.rename(cols_renaming) - sorted_cols = sorted(table.columns) - - return table.select(sorted_cols) - - -def join( - left: DataFrameLike, - right: DataFrameLike, - left_on: str | Iterable[str], - right_on: str | Iterable[str], -) -> DataFrameLike: - """Left join two :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`. - - This function uses the ``dataframe.join`` method from Polars. - - Note that the input dataframes type must agree: either both - Polars dataframes or both Polars lazyframes. - - Mixing polars dataframe with lazyframe will raise an error. - - Parameters - ---------- - left : pl.DataFrame or pl.LazyFrame, - The left dataframe of the left-join. - - right : pl.DataFrame or pl.LazyFrame, - The right dataframe of the left-join. - - left_on : str or Iterable[str], - Left keys to merge on. - - right_on : str or Iterable[str], - Right keys to merge on. - - Returns - ------- - merged : pl.DataFrame or pl.LazyFrame, - The merged output. - """ - is_dataframe = isinstance(left, pl.DataFrame) and isinstance(right, pl.DataFrame) - is_lazyframe = isinstance(left, pl.LazyFrame) and isinstance(right, pl.LazyFrame) - if is_dataframe or is_lazyframe: - return left.join( - right, - how="left", - left_on=left_on, - right_on=right_on, - ) - else: - raise TypeError( - "'left' and 'right' must be polars dataframes or lazyframes, " - f"got {type(left)!r} and {type(right)!r}." - ) - - -def get_aggfuncs( - cols: list[str], - operations: list[str], -) -> tuple[list, list]: - """List Polars aggregation functions. - - The list is used as input for the ``dataframe.group_by().agg()`` method from Polars. - The 'mode' operation needs a flattening post-processing. - - Parameters - ---------- - cols : list, - The columns to aggregate. - - operations : list, - The reduce operations to perform. - - Returns - ------- - aggfuncs : list, - Named aggregation list. - - mode_cols : list, - Output keys to post-process after 'mode' aggregation. - """ - aggfuncs, mode_cols = [], [] - for col, operation in product(cols, operations): - output_key = f"{col}_{operation}" - aggfunc = _polars_ops_mapping(col, operation, output_key) - aggfuncs.append(aggfunc) - - if operation == "mode": - mode_cols.append(output_key) - - return aggfuncs, mode_cols - - -def _polars_ops_mapping(col, operation, output_key): - """Map an operation to its Polars expression. - - Parameters - ---------- - col : str, - Name of the column to aggregate. - operation : str, - Name of the reduce function. - output_key : str, - Name of the reduced column. - - Returns - ------- - aggfunc: polars.Expression, - The expression to apply. - """ - polars_aggfuncs = { - "mean": pl.col(col).mean(), - "std": pl.col(col).std(), - "sum": pl.col(col).sum(), - "min": pl.col(col).min(), - "max": pl.col(col).max(), - "mode": pl.col(col).mode(), - } - aggfunc = polars_aggfuncs.get(operation, None) - - if aggfunc is None: - raise ValueError( - f"Polars operation {operation!r} is not supported. 
Available:" - f" {list(polars_aggfuncs)}" - ) - - return aggfunc.alias(output_key) - - -def split_num_categ_cols(table): - """Split a dataframe columns between numerical and categorical.""" - num_cols = table.select(cs.numeric()).columns - categ_cols = table.select(cs.string()).columns - - return num_cols, categ_cols diff --git a/skrub/dataframe/_types.py b/skrub/dataframe/_types.py deleted file mode 100644 index 0468a613c..000000000 --- a/skrub/dataframe/_types.py +++ /dev/null @@ -1,14 +0,0 @@ -import pandas as pd - -try: - import polars as pl - - POLARS_SETUP = True -except ImportError: - POLARS_SETUP = False - -DataFrameLike = pd.DataFrame -SeriesLike = pd.Series -if POLARS_SETUP: - DataFrameLike |= pl.DataFrame | pl.LazyFrame - SeriesLike |= pl.Series diff --git a/skrub/dataframe/tests/__init__.py b/skrub/dataframe/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/skrub/dataframe/tests/test_namespace.py b/skrub/dataframe/tests/test_namespace.py deleted file mode 100644 index bccdbabe9..000000000 --- a/skrub/dataframe/tests/test_namespace.py +++ /dev/null @@ -1,41 +0,0 @@ -import pandas as pd -import pytest - -import skrub.dataframe._pandas as skrub_pd -import skrub.dataframe._polars as skrub_pl -from skrub.dataframe import POLARS_SETUP, get_df_namespace - -main = pd.DataFrame( - { - "userId": [1, 1, 1, 2, 2, 2], - "movieId": [1, 3, 6, 318, 6, 1704], - "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], - "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], - } -) - - -def test_get_namespace_pandas(): - skrub_px, px = get_df_namespace(main, main) - assert skrub_px is skrub_pd - assert px is pd - - with pytest.raises(TypeError, match=r"(?=.*Only Pandas or Polars)(?=.*supported)"): - get_df_namespace(main, main.values) - - -@pytest.mark.skipif(not POLARS_SETUP, reason="Polars is not available") -def test_get_namespace_polars(): - import polars as pl - - skrub_px, px = get_df_namespace(pl.DataFrame(main), pl.DataFrame(main)) - assert skrub_px is skrub_pl - assert px is pl - - with pytest.raises(TypeError, match=r"(?=.*Mixing Pandas)(?=.*Polars)"): - get_df_namespace(main, pl.DataFrame(main)) - - with pytest.raises( - TypeError, match=r"(?=.*Mixing)(?=.*lazyframes)(?=.*dataframes)" - ): - get_df_namespace(pl.DataFrame(main), pl.LazyFrame(main)) diff --git a/skrub/dataframe/tests/test_pandas.py b/skrub/dataframe/tests/test_pandas.py deleted file mode 100644 index 1453dd230..000000000 --- a/skrub/dataframe/tests/test_pandas.py +++ /dev/null @@ -1,113 +0,0 @@ -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal, assert_series_equal - -from skrub.dataframe._pandas import aggregate, join, make_dataframe, make_series - -main = pd.DataFrame( - { - "userId": [1, 1, 1, 2, 2, 2], - "movieId": [1, 3, 6, 318, 6, 1704], - "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], - "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], - } -) - - -def test_join(): - joined = join(left=main, right=main, left_on="movieId", right_on="movieId") - expected = main.merge(main, on="movieId", how="left") - assert_frame_equal(joined, expected) - - -def test_simple_agg(): - aggregated = aggregate( - table=main, - key="movieId", - cols_to_agg=["rating", "genre"], - num_operations="mean", - categ_operations="mode", - ) - aggfunc = { - "genre_mode": ("genre", pd.Series.mode), - "rating_mean": ("rating", "mean"), - } - expected = main.groupby("movieId").agg(**aggfunc) - assert_frame_equal(aggregated, expected) - - -def test_value_counts_agg(): - aggregated = 
aggregate( - table=main, - key="userId", - cols_to_agg="rating", - num_operations="value_counts", - categ_operations=None, - suffix="_user", - ) - expected = pd.DataFrame( - { - "rating_2.0_user": [0.0, 1.0], - "rating_3.0_user": [0.0, 1.0], - "rating_4.0_user": [3.0, 1.0], - "userId": [1, 2], - } - ) - assert_frame_equal(aggregated, expected) - - aggregated = aggregate( - table=main, - key="userId", - cols_to_agg="rating", - num_operations="hist(2)", - categ_operations=None, - suffix="_user", - ) - expected = pd.DataFrame( - { - "rating_(1.999, 3.0]_user": [0, 2], - "rating_(3.0, 4.0]_user": [3, 1], - "userId": [1, 2], - } - ) - assert_frame_equal(aggregated, expected) - - -def test_incorrect_dataframe_inputs(): - with pytest.raises(TypeError, match=r"(?=.*pandas dataframes)(?=.*array)"): - join(left=main.values, right=main, left_on="movieId", right_on="movieId") - - with pytest.raises(TypeError, match=r"(?=.*pandas dataframe)(?=.*array)"): - aggregate( - table=main.values, - key="movieId", - cols_to_agg="rating", - num_operations="mean", - ) - - -def test_no_agg_operation(): - with pytest.raises(ValueError, match=r"(?=.*No aggregation)"): - aggregate( - table=main, - key="movieId", - cols_to_agg="rating", - num_operations=None, - categ_operations=None, - ) - - -def test_make_dataframe(): - X = dict(a=[1, 2], b=["z", "e"]) - expected_df = pd.DataFrame(dict(a=[1, 2], b=["z", "e"])) - assert_frame_equal(make_dataframe(X, index=[0, 1]), expected_df) - - X = [[1, 2], ["z", "e"]] - with pytest.raises(TypeError): - make_dataframe(X) - - -def test_make_series(): - X = [1, 2, 3] - expected_series = pd.Series(X) - assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) diff --git a/skrub/dataframe/tests/test_polars.py b/skrub/dataframe/tests/test_polars.py deleted file mode 100644 index c9f8bc4bd..000000000 --- a/skrub/dataframe/tests/test_polars.py +++ /dev/null @@ -1,92 +0,0 @@ -import pandas as pd -import pytest - -from skrub.dataframe import POLARS_SETUP -from skrub.dataframe._polars import aggregate, join, make_dataframe, make_series - -if POLARS_SETUP: - import polars as pl - from polars.testing import assert_frame_equal, assert_series_equal - - main = pl.DataFrame( - { - "userId": [1, 1, 1, 2, 2, 2], - "movieId": [1, 3, 6, 318, 6, 1704], - "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], - "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], - } - ) -else: - polars_missing_msg = "Polars is not available" - pytest.skip(reason=polars_missing_msg, allow_module_level=True) - - -def test_join(): - joined = join(left=main, right=main, left_on="movieId", right_on="movieId") - expected = main.join(main, on="movieId", how="left") - assert_frame_equal(joined, expected) - - -def test_simple_agg(): - aggregated = aggregate( - table=main, - key="movieId", - cols_to_agg="rating", - num_operations="mean", - ) - aggfunc = pl.col("rating").mean().alias("rating_mean") - expected = main.group_by("movieId").agg(aggfunc) - # As group_by parallizes threads, the row order of its output isn't - # deterministic. Hence, we need to set check_row_order to False. 
- assert_frame_equal(aggregated, expected, check_row_order=False) - - -def test_mode_agg(): - aggregated = aggregate( - table=main, - key="movieId", - cols_to_agg="genre", - categ_operations=["mode"], - ) - expected = pl.DataFrame( - { - "genre_mode": ["drama", "drama", "sf", "sf", "comedy"], - "movieId": [3, 1, 318, 1704, 6], - } - ) - assert_frame_equal(aggregated, expected, check_row_order=False) - - -def test_incorrect_dataframe_inputs(): - with pytest.raises(TypeError, match=r"(?=.*polars dataframes)(?=.*pandas)"): - join(left=pd.DataFrame(main), right=main, left_on="movieId", right_on="movieId") - - with pytest.raises(TypeError, match=r"(?=.*polars dataframe)(?=.*pandas)"): - aggregate( - table=pd.DataFrame(main), - key="movieId", - cols_to_agg="rating", - num_operations="mean", - ) - - -def test_make_dataframe(): - X = dict(a=[1, 2], b=["z", "e"]) - expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) - assert_frame_equal(make_dataframe(X), expected_df) - - X = [[1, 2], ["z", "e"]] - with pytest.raises(TypeError): - make_dataframe(X) - - with pytest.raises(ValueError, match=r"(?=.*Polars dataframe)(?=.*index)"): - make_dataframe(X, index=[0, 1]) - - -def test_make_series(): - X = [1, 2, 3] - expected_series = pl.Series(X) - assert_series_equal(make_series(X, index=None), expected_series) - - with pytest.raises(ValueError, match=r"(?=.*Polars series)(?=.*index)"): - make_series(X, index=[0, 1]) diff --git a/skrub/tests/test_agg_joiner.py b/skrub/tests/test_agg_joiner.py index 6ed8ac2af..bd0ebf440 100644 --- a/skrub/tests/test_agg_joiner.py +++ b/skrub/tests/test_agg_joiner.py @@ -3,7 +3,7 @@ from pandas.testing import assert_frame_equal from sklearn.pipeline import make_pipeline -from skrub.dataframe import POLARS_SETUP +from skrub._dataframe._polars import POLARS_SETUP if POLARS_SETUP: import polars as pl From 54502dfaf1cadb9f8e6aa07b26e5fb3465db64b4 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Tue, 31 Oct 2023 18:02:43 +0100 Subject: [PATCH 5/7] add private modules --- skrub/_dataframe/__init__.py | 0 skrub/_dataframe/_namespace.py | 99 +++++++ skrub/_dataframe/_pandas.py | 328 +++++++++++++++++++++++ skrub/_dataframe/_polars.py | 263 ++++++++++++++++++ skrub/_dataframe/tests/__init__.py | 0 skrub/_dataframe/tests/test_namespace.py | 42 +++ skrub/_dataframe/tests/test_pandas.py | 109 ++++++++ skrub/_dataframe/tests/test_polars.py | 93 +++++++ 8 files changed, 934 insertions(+) create mode 100644 skrub/_dataframe/__init__.py create mode 100644 skrub/_dataframe/_namespace.py create mode 100644 skrub/_dataframe/_pandas.py create mode 100644 skrub/_dataframe/_polars.py create mode 100644 skrub/_dataframe/tests/__init__.py create mode 100644 skrub/_dataframe/tests/test_namespace.py create mode 100644 skrub/_dataframe/tests/test_pandas.py create mode 100644 skrub/_dataframe/tests/test_polars.py diff --git a/skrub/_dataframe/__init__.py b/skrub/_dataframe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/skrub/_dataframe/_namespace.py b/skrub/_dataframe/_namespace.py new file mode 100644 index 000000000..06c65a2ea --- /dev/null +++ b/skrub/_dataframe/_namespace.py @@ -0,0 +1,99 @@ +import sys + +import pandas as pd + +import skrub._dataframe._pandas as skrub_pd +import skrub._dataframe._polars as skrub_pl + + +def is_pandas(dataframe): + """Check whether the input is a Pandas dataframe. + + Parameters + ---------- + dataframe : DataFrameLike + The input dataframe + + Returns + ------- + is_pandas : bool + Whether the dataframe is a Pandas dataframe or not. 
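+
+    Examples
+    --------
+    A quick check on both a dataframe and a plain dict:
+
+    >>> import pandas as pd
+    >>> is_pandas(pd.DataFrame({"a": [1]}))
+    True
+    >>> is_pandas({"a": [1]})
+    False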
+ """ + return isinstance(dataframe, pd.DataFrame) + + +def is_polars(dataframe): + """Check whether the input is a Polars dataframe or lazyframe. + + Parameters + ---------- + dataframe : DataFrameLike + The input dataframe + + Returns + ------- + is_polars : bool + Whether the dataframe is a Polars dataframe/lazyframe or not. + """ + if "polars" not in sys.modules: + return False + + import polars as pl + + return isinstance(dataframe, (pl.DataFrame, pl.LazyFrame)) + + +def get_df_namespace(*dfs): + """Get the namespaces of dataframes. + + Introspects dataframes and returns their skrub namespace object + ``skrub.dataframe._{pandas, polars}`` and the dataframe module + ``{polars, pandas}`` itself. + + The dataframes passed in input need to come from the same module, otherwise a + ``TypeError`` will be raised. + + The outputs of this function are denoted ``skrub_px`` and ``px`` in reference to + the array API, returning namespace (NumPy, PyTorch and CuPy) as ``nx``. + Since we deal with Polars (``pl``) and Pandas (``pd``), we use ``px`` + as a variable name. + + Parameters + ---------- + dfs : DataFrameLike | list[DataFrameLike], + The dataframes to extract modules from. + + Returns + ------- + skrub_px : ModuleType + Skrub namespace shared by dataframe objects. + + px : ModuleType + Dataframe namespace, i.e. Pandas or Polars module. + """ + # FIXME Pandas and Polars series will raise errors. + if all([is_pandas(df) for df in dfs]): + return skrub_pd, pd + + elif all([is_polars(df) for df in dfs]): + import polars as pl + + if all([isinstance(df, pl.DataFrame) for df in dfs]) or all( + [isinstance(df, pl.LazyFrame) for df in dfs] + ): + return skrub_pl, pl + else: + raise TypeError("Mixing Polars lazyframes and dataframes is not supported.") + + else: + modules = [type(df).__module__ for df in dfs] + if all([is_polars(df) or is_pandas(df) for df in dfs]): + raise TypeError( + "Mixing Pandas and Polars dataframes is not supported, " + f"got {modules=!r}." + ) + else: + raise TypeError( + "Only Pandas or Polars dataframes are currently supported, " + f"got {modules=!r}." + ) diff --git a/skrub/_dataframe/_pandas.py b/skrub/_dataframe/_pandas.py new file mode 100644 index 000000000..c8fb820b1 --- /dev/null +++ b/skrub/_dataframe/_pandas.py @@ -0,0 +1,328 @@ +""" +Pandas specialization of the aggregate and join operation. +""" +import re +from itertools import product + +import numpy as np +import pandas as pd + +from skrub._utils import atleast_1d_or_none + + +def make_dataframe(X, index=None): + """Convert an dictionary of columns into a Pandas dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the dataframe. + + Returns + ------- + X : Pandas dataframe + Converted output. + """ + return pd.DataFrame(X, index=index) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Pandas series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + The index of the series. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Pandas series + Converted output. + """ + return pd.Series(X, index=index, name=name) + + +def aggregate( + table, + key, + cols_to_agg, + num_operations=("mean",), + categ_operations=("mode",), + suffix=None, +): + """Aggregates a :obj:`pandas.DataFrame`. + + This function uses the ``dataframe.groupby(key).agg`` method from Pandas. 
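+
+    For example, ``num_operations="hist(2)"`` bins each numerical column in
+    ``cols_to_agg`` into two fixed-width bins and counts the entries per bin,
+    yielding one output column per bin.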
+ + Parameters + ---------- + table : pd.DataFrame, + The input dataframe to aggregate. + + key : str or Iterable[str], + The columns used as keys to aggregate on. + + cols_to_agg : str or Iterable[str], + The columns to aggregate. + + num_operations : str or Iterable[str], + The reduction functions to apply on numerical columns + in ``cols_to_agg`` during the aggregation. + + categ_operations : str or Iterable[str], + The reduction functions to apply on categorical columns + in ``cols_to_agg`` during the aggregation. + + suffix : str, optional + The suffix appended to output columns. + + Returns + ------- + group : pd.DataFrame, + The aggregated output. + """ + if not isinstance(table, pd.DataFrame): + raise TypeError(f"'table' must be a pandas dataframe, got {type(table)!r}.") + + key = atleast_1d_or_none(key) + cols_to_agg = atleast_1d_or_none(cols_to_agg) + num_operations = atleast_1d_or_none(num_operations) + categ_operations = atleast_1d_or_none(categ_operations) + suffix = "" if suffix is None else suffix + + num_cols, categ_cols = split_num_categ_cols(table[cols_to_agg]) + + num_named_agg, num_value_counts = get_named_agg(table, num_cols, num_operations) + categ_named_agg, categ_value_counts = get_named_agg( + table, categ_cols, categ_operations + ) + + named_agg = {**num_named_agg, **categ_named_agg} + if named_agg: + base_group = table.groupby(key).agg(**named_agg) + else: + base_group = None + + # 'histogram' and 'value_counts' requires a pivot + value_counts = {**num_value_counts, **categ_value_counts} + for output_key, (col_to_agg, kwargs) in value_counts.items(): + serie_group = table.groupby(key)[col_to_agg].value_counts(**kwargs) + serie_group.name = output_key + pivot = ( + serie_group.reset_index() + .pivot(index=key, columns=col_to_agg) + .reset_index() + .fillna(0) + ) + cols = pivot.columns.droplevel(0) + index_cols = np.atleast_1d(key).tolist() + feature_cols = (f"{col_to_agg}_" + cols[len(index_cols) :].astype(str)).tolist() + cols = [*index_cols, *feature_cols] + pivot.columns = cols + + if base_group is None: + base_group = pivot + else: + base_group = base_group.merge(pivot, on=key, how="left") + + if base_group is None: + raise ValueError("No aggregation to perform.") + + base_group.columns = [ + f"{col}{suffix}" if col not in key else col for col in base_group.columns + ] + sorted_cols = sorted(base_group.columns) + + return base_group[sorted_cols] + + +def join( + left, + right, + left_on, + right_on, +): + """Left join two :obj:`pandas.DataFrame`. + + This function uses the ``dataframe.merge`` method from Pandas. + + Parameters + ---------- + left : pd.DataFrame, + The left dataframe to left-join. + + right : pd.DataFrame, + The right dataframe to left-join. + + left_on : str or Iterable[str] + Left keys to merge on. + + right_on : str or Iterable[str] + Right keys to merge on. + + Returns + ------- + merged : pd.DataFrame, + The merged output. + """ + if not (isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame)): + raise TypeError( + "'left' and 'right' must be pandas dataframes, " + f"got {type(left)!r} and {type(right)!r}." + ) + return left.merge( + right, + how="left", + left_on=left_on, + right_on=right_on, + ) + + +def get_named_agg(table, cols, operations): + """Map aggregation tuples to their output key. + + The dictionary has the form: output_key = (column, aggfunc). + This is used as input for the ``dataframe.agg`` method from Pandas. 
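+    For instance, ``{"rating_mean": ("rating", "mean")}`` requests the mean
+    of the ``rating`` column under the output name ``rating_mean``.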
+ + 'value_counts' and 'hist' operation require to pivot + the tables and treated in a separate mapping. + + Parameters + ---------- + table : pd.DataFrame, + Input dataframe, only used to compute bins values if + 'value_counts' or 'hist' are operations. + + cols : list, + The columns to aggregate. + + operations : list, + The reduce operations to perform. + + Returns + ------- + named_agg : dict, + Named aggregation mapping. + + value_counts : dict, + ``value_counts`` operations mapping. + """ + named_agg, value_counts = {}, {} + for col, operation in product(cols, operations): + op_root, bin_args = _parse_argument(operation) + aggfunc, bin_args = _get_aggfunc(table[col], op_root, bin_args) + + output_key = f"{col}_{op_root}" + # 'value_counts' change the index of the resulting frame + # and must be treated separately. + if aggfunc == "value_counts": + value_counts[output_key] = (col, bin_args) + else: + named_agg[output_key] = (col, aggfunc) + + return named_agg, value_counts + + +def _parse_argument(operation): + """Split a text input into a function name and its argument. + + Parameters + ---------- + operation : str, + The operation to parse. + + Returns + ------- + operation_root : str, + The name of the operation before parenthesis, if any. + + bin_args : int, + The number of bin to create for ``hist`` or ``value_counts``. + + Examples + -------- + >>> _parse_argument("hist(10)") + ('hist', 10) + """ + split = re.split("\\(.+\\)", operation) + op_root = split[0] + if len(split) > 1: + # remove op_root + bin_args = re.split(f"^{op_root}", operation) + bin_args = bin_args[1] + # remove parenthesis + bin_args = re.sub("\\(|\\)", "", bin_args) + bin_args = int(bin_args) + return op_root, bin_args + else: + return op_root, None + + +PANDAS_OPS_MAPPING = { + "mode": pd.Series.mode, + "quantile": pd.Series.quantile, + "hist": "value_counts", +} + + +def _get_aggfunc(serie, op_root, n_bins): + """Map operation roots to their pandas agg functions. + + When args is provided for histogram or value_counts, + we create args + + Parameters + ---------- + serie : pd.Series, + Input series, used to compute the bins if n_bins is provided. + + op_root : str, + Operation root, the operation without the bin argument, if any. + + n_bins : int, + The number of bin to create when value_counts or hist operation are used. + + Returns + ------- + aggfunc : str or callable, + The pandas agg functions to perform + + bins_args : dict, + The bins to create when using value_counts or hist. + """ + aggfunc = PANDAS_OPS_MAPPING.get(op_root, op_root) + + if n_bins is not None: + # histogram and value_counts + if aggfunc == "value_counts": + # If bins is a number, we need to set a fix bin range, + # otherwise bins edges will be defined dynamically for + # each rows. 
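+            # For instance, n_bins=2 over values spanning [2.0, 4.0] yields
+            # bin edges [2.0, 3.0, 4.0], i.e. two fixed-width bins.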
+ min_, max_ = serie.min(), serie.max() + bins = np.linspace(min_, max_, n_bins + 1) + bins_args = dict(bins=bins) + else: + raise ValueError( + f"Operator {op_root!r} doesn't take any argument, got {n_bins!r}" + ) + else: + bins_args = {} + + return aggfunc, bins_args + + +def split_num_categ_cols(table): + """Split dataframe columns between numerical and categorical.""" + num_cols = table.select_dtypes("number").columns + categ_cols = table.select_dtypes(["object", "string", "category"]).columns + + return num_cols, categ_cols diff --git a/skrub/_dataframe/_polars.py b/skrub/_dataframe/_polars.py new file mode 100644 index 000000000..20f7d1913 --- /dev/null +++ b/skrub/_dataframe/_polars.py @@ -0,0 +1,263 @@ +""" +Polars specialization of the aggregate and join operations. +""" +try: + import polars as pl + import polars.selectors as cs + + POLARS_SETUP = True +except ImportError: + POLARS_SETUP = False + +from itertools import product + +from skrub._utils import atleast_1d_or_none + + +def make_dataframe(X, index=None): + """Convert an dictionary of columns into a Polars dataframe. + + Parameters + ---------- + X : mapping from column name to 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + Returns + ------- + X : Polars dataframe + Converted output. + """ + if index is not None: + raise ValueError( + "Polars dataframes don't have an index, but " + f"the Polars dataframe maker was called with {index=!r}." + ) + return pl.DataFrame(X) + + +def make_series(X, index=None, name=None): + """Convert an 1d array into a Polars series. + + Parameters + ---------- + X : 1d iterable + Input data to convert. + + index : 1d array-like, default=None + Unused since polars doesn't use index. + Only here for compatibility with Pandas. + + name : str, default=None + The name of the series. + + Returns + ------- + X : Polars series + Converted output. + """ + if index is not None: + raise ValueError( + "Polars series don't have an index, but " + f"the Polars series maker was called with {index=!r}." + ) + return pl.Series(values=X, name=name) + + +def aggregate( + table, + key, + cols_to_agg, + num_operations=("mean",), + categ_operations=("mode",), + suffix=None, +): + """Aggregate a :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`. + + This function uses the ``dataframe.group_by(key).agg`` method from Polars. + + Parameters + ---------- + table : pl.DataFrame or pl.LazyFrame, + The input dataframe to aggregate. + + key : str or Iterable[str], + The columns used as keys to aggregate on. + + cols_to_agg : str or Iterable[str], + The columns to aggregate. + + num_operations : str or Iterable[str], + The reduction functions to apply on numerical columns + in ``cols_to_agg`` during the aggregation. + + categ_operations : str or Iterable[str], + The reduction functions to apply on categorical columns + in ``cols_to_agg`` during the aggregation. + + suffix : str, + The suffix appended to output columns. + + Returns + ------- + group : pl.DataFrame or pl.LazyFrame, + The aggregated output. + """ + if not isinstance(table, (pl.DataFrame, pl.LazyFrame)): + raise TypeError( + f"'table' must be a polars dataframe or lazyframe, got {type(table)!r}." 
+    key = atleast_1d_or_none(key)
+    cols_to_agg = atleast_1d_or_none(cols_to_agg)
+    num_operations = atleast_1d_or_none(num_operations)
+    categ_operations = atleast_1d_or_none(categ_operations)
+    suffix = "" if suffix is None else suffix
+
+    num_cols, categ_cols = split_num_categ_cols(table.select(cols_to_agg))
+
+    num_aggfuncs, num_mode_cols = get_aggfuncs(num_cols, num_operations)
+    categ_aggfuncs, categ_mode_cols = get_aggfuncs(categ_cols, categ_operations)
+
+    aggfuncs = [*num_aggfuncs, *categ_aggfuncs]
+    # If aggfuncs is empty, the output will only contain the key columns.
+    table = table.group_by(key).agg(aggfuncs)
+
+    # Flatten the list output of the mode() aggregations by taking the
+    # first value.
+    flatten_ops = []
+    for col in [*num_mode_cols, *categ_mode_cols]:
+        flatten_ops.append(pl.col(col).list[0].alias(col))
+    # Add the flattened columns; this is a no-op if 'flatten_ops' is empty.
+    table = table.with_columns(flatten_ops)
+
+    cols_renaming = {col: f"{col}{suffix}" for col in table.columns if col not in key}
+    table = table.rename(cols_renaming)
+    sorted_cols = sorted(table.columns)
+
+    return table.select(sorted_cols)
+
+
+def join(left, right, left_on, right_on):
+    """Left join two :obj:`polars.DataFrame` or :obj:`polars.LazyFrame`.
+
+    This function uses the ``dataframe.join`` method from Polars.
+
+    Note that the input dataframe types must agree: either both
+    Polars dataframes or both Polars lazyframes.
+
+    Mixing a Polars dataframe with a lazyframe raises an error.
+
+    Parameters
+    ----------
+    left : pl.DataFrame or pl.LazyFrame,
+        The left dataframe of the left-join.
+
+    right : pl.DataFrame or pl.LazyFrame,
+        The right dataframe of the left-join.
+
+    left_on : str or Iterable[str],
+        Left keys to merge on.
+
+    right_on : str or Iterable[str],
+        Right keys to merge on.
+
+    Returns
+    -------
+    merged : pl.DataFrame or pl.LazyFrame,
+        The merged output.
+    """
+    is_dataframe = isinstance(left, pl.DataFrame) and isinstance(right, pl.DataFrame)
+    is_lazyframe = isinstance(left, pl.LazyFrame) and isinstance(right, pl.LazyFrame)
+    if is_dataframe or is_lazyframe:
+        return left.join(
+            right,
+            how="left",
+            left_on=left_on,
+            right_on=right_on,
+        )
+    else:
+        raise TypeError(
+            "'left' and 'right' must be polars dataframes or lazyframes, "
+            f"got {type(left)!r} and {type(right)!r}."
+        )
+
+
+def get_aggfuncs(cols, operations):
+    """List Polars aggregation functions.
+
+    The list is used as input for the ``dataframe.group_by().agg()`` method
+    from Polars.
+    The 'mode' operation needs a flattening post-processing step.
+
+    Parameters
+    ----------
+    cols : list,
+        The columns to aggregate.
+
+    operations : list,
+        The reduce operations to perform.
+
+    Returns
+    -------
+    aggfuncs : list,
+        Named aggregation list.
+
+    mode_cols : list,
+        Output keys to post-process after 'mode' aggregation.
+    """
+    aggfuncs, mode_cols = [], []
+    for col, operation in product(cols, operations):
+        output_key = f"{col}_{operation}"
+        aggfunc = _polars_ops_mapping(col, operation, output_key)
+        aggfuncs.append(aggfunc)
+
+        if operation == "mode":
+            mode_cols.append(output_key)
+
+    return aggfuncs, mode_cols
+
+
+def _polars_ops_mapping(col, operation, output_key):
+    """Map an operation to its Polars expression.
+
+    Parameters
+    ----------
+    col : str,
+        Name of the column to aggregate.
+    operation : str,
+        Name of the reduce function.
+    output_key : str,
+        Name of the reduced column.
+
+    Returns
+    -------
+    aggfunc : polars.Expr,
+        The expression to apply.
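+
+    Examples
+    --------
+    Requesting an operation outside the mapping below raises a
+    ``ValueError`` listing the supported ones:
+
+    >>> _polars_ops_mapping("rating", "median", "rating_median")
+    Traceback (most recent call last):
+        ...
+    ValueError: Polars operation 'median' is not supported. Available: ['mean', 'std', 'sum', 'min', 'max', 'mode']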
+ """ + polars_aggfuncs = { + "mean": pl.col(col).mean(), + "std": pl.col(col).std(), + "sum": pl.col(col).sum(), + "min": pl.col(col).min(), + "max": pl.col(col).max(), + "mode": pl.col(col).mode(), + } + aggfunc = polars_aggfuncs.get(operation, None) + + if aggfunc is None: + raise ValueError( + f"Polars operation {operation!r} is not supported. Available:" + f" {list(polars_aggfuncs)}" + ) + + return aggfunc.alias(output_key) + + +def split_num_categ_cols(table): + """Split a dataframe columns between numerical and categorical.""" + num_cols = table.select(cs.numeric()).columns + categ_cols = table.select(cs.string()).columns + + return num_cols, categ_cols diff --git a/skrub/_dataframe/tests/__init__.py b/skrub/_dataframe/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/skrub/_dataframe/tests/test_namespace.py b/skrub/_dataframe/tests/test_namespace.py new file mode 100644 index 000000000..b6d944f52 --- /dev/null +++ b/skrub/_dataframe/tests/test_namespace.py @@ -0,0 +1,42 @@ +import pandas as pd +import pytest + +import skrub._dataframe._pandas as skrub_pd +import skrub._dataframe._polars as skrub_pl +from skrub._dataframe._namespace import get_df_namespace +from skrub._dataframe._polars import POLARS_SETUP + +main = pd.DataFrame( + { + "userId": [1, 1, 1, 2, 2, 2], + "movieId": [1, 3, 6, 318, 6, 1704], + "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], + "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], + } +) + + +def test_get_namespace_pandas(): + skrub_px, px = get_df_namespace(main, main) + assert skrub_px is skrub_pd + assert px is pd + + with pytest.raises(TypeError, match=r"(?=.*Only Pandas or Polars)(?=.*supported)"): + get_df_namespace(main, main.values) + + +@pytest.mark.skipif(not POLARS_SETUP, reason="Polars is not available") +def test_get_namespace_polars(): + import polars as pl + + skrub_px, px = get_df_namespace(pl.DataFrame(main), pl.DataFrame(main)) + assert skrub_px is skrub_pl + assert px is pl + + with pytest.raises(TypeError, match=r"(?=.*Mixing Pandas)(?=.*Polars)"): + get_df_namespace(main, pl.DataFrame(main)) + + with pytest.raises( + TypeError, match=r"(?=.*Mixing)(?=.*lazyframes)(?=.*dataframes)" + ): + get_df_namespace(pl.DataFrame(main), pl.LazyFrame(main)) diff --git a/skrub/_dataframe/tests/test_pandas.py b/skrub/_dataframe/tests/test_pandas.py new file mode 100644 index 000000000..7a245a365 --- /dev/null +++ b/skrub/_dataframe/tests/test_pandas.py @@ -0,0 +1,109 @@ +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal, assert_series_equal + +from skrub._dataframe._pandas import aggregate, join, make_dataframe, make_series + +main = pd.DataFrame( + { + "userId": [1, 1, 1, 2, 2, 2], + "movieId": [1, 3, 6, 318, 6, 1704], + "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], + "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], + } +) + + +def test_join(): + joined = join(left=main, right=main, left_on="movieId", right_on="movieId") + expected = main.merge(main, on="movieId", how="left") + assert_frame_equal(joined, expected) + + +def test_simple_agg(): + aggregated = aggregate( + table=main, + key="movieId", + cols_to_agg=["rating", "genre"], + num_operations="mean", + categ_operations="mode", + ) + aggfunc = { + "genre_mode": ("genre", pd.Series.mode), + "rating_mean": ("rating", "mean"), + } + expected = main.groupby("movieId").agg(**aggfunc) + assert_frame_equal(aggregated, expected) + + +def test_value_counts_agg(): + aggregated = aggregate( + table=main, + key="userId", + 
cols_to_agg="rating", + num_operations="value_counts", + categ_operations=None, + suffix="_user", + ) + expected = pd.DataFrame( + { + "rating_2.0_user": [0.0, 1.0], + "rating_3.0_user": [0.0, 1.0], + "rating_4.0_user": [3.0, 1.0], + "userId": [1, 2], + } + ) + assert_frame_equal(aggregated, expected) + + aggregated = aggregate( + table=main, + key="userId", + cols_to_agg="rating", + num_operations="hist(2)", + categ_operations=None, + suffix="_user", + ) + expected = pd.DataFrame( + { + "rating_(1.999, 3.0]_user": [0, 2], + "rating_(3.0, 4.0]_user": [3, 1], + "userId": [1, 2], + } + ) + assert_frame_equal(aggregated, expected) + + +def test_incorrect_dataframe_inputs(): + with pytest.raises(TypeError, match=r"(?=.*pandas dataframes)(?=.*array)"): + join(left=main.values, right=main, left_on="movieId", right_on="movieId") + + with pytest.raises(TypeError, match=r"(?=.*pandas dataframe)(?=.*array)"): + aggregate( + table=main.values, + key="movieId", + cols_to_agg="rating", + num_operations="mean", + ) + + +def test_no_agg_operation(): + with pytest.raises(ValueError, match=r"(?=.*No aggregation)"): + aggregate( + table=main, + key="movieId", + cols_to_agg="rating", + num_operations=None, + categ_operations=None, + ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pd.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X, index=[0, 1]), expected_df) + + +def test_make_series(): + X = [1, 2, 3] + expected_series = pd.Series(X) + assert_series_equal(make_series(X, index=[0, 1, 2]), expected_series) diff --git a/skrub/_dataframe/tests/test_polars.py b/skrub/_dataframe/tests/test_polars.py new file mode 100644 index 000000000..2e55952a7 --- /dev/null +++ b/skrub/_dataframe/tests/test_polars.py @@ -0,0 +1,93 @@ +import pandas as pd +import pytest + +from skrub._dataframe._polars import ( + POLARS_SETUP, + aggregate, + join, + make_dataframe, + make_series, +) + +if POLARS_SETUP: + import polars as pl + from polars.testing import assert_frame_equal, assert_series_equal + + main = pl.DataFrame( + { + "userId": [1, 1, 1, 2, 2, 2], + "movieId": [1, 3, 6, 318, 6, 1704], + "rating": [4.0, 4.0, 4.0, 3.0, 2.0, 4.0], + "genre": ["drama", "drama", "comedy", "sf", "comedy", "sf"], + } + ) +else: + polars_missing_msg = "Polars is not available" + pytest.skip(reason=polars_missing_msg, allow_module_level=True) + + +def test_join(): + joined = join(left=main, right=main, left_on="movieId", right_on="movieId") + expected = main.join(main, on="movieId", how="left") + assert_frame_equal(joined, expected) + + +def test_simple_agg(): + aggregated = aggregate( + table=main, + key="movieId", + cols_to_agg="rating", + num_operations="mean", + ) + aggfunc = pl.col("rating").mean().alias("rating_mean") + expected = main.group_by("movieId").agg(aggfunc) + # As group_by parallizes threads, the row order of its output isn't + # deterministic. Hence, we need to set check_row_order to False. 
+ assert_frame_equal(aggregated, expected, check_row_order=False) + + +def test_mode_agg(): + aggregated = aggregate( + table=main, + key="movieId", + cols_to_agg="genre", + categ_operations=["mode"], + ) + expected = pl.DataFrame( + { + "genre_mode": ["drama", "drama", "sf", "sf", "comedy"], + "movieId": [3, 1, 318, 1704, 6], + } + ) + assert_frame_equal(aggregated, expected, check_row_order=False) + + +def test_incorrect_dataframe_inputs(): + with pytest.raises(TypeError, match=r"(?=.*polars dataframes)(?=.*pandas)"): + join(left=pd.DataFrame(main), right=main, left_on="movieId", right_on="movieId") + + with pytest.raises(TypeError, match=r"(?=.*polars dataframe)(?=.*pandas)"): + aggregate( + table=pd.DataFrame(main), + key="movieId", + cols_to_agg="rating", + num_operations="mean", + ) + + +def test_make_dataframe(): + X = dict(a=[1, 2], b=["z", "e"]) + expected_df = pl.DataFrame(dict(a=[1, 2], b=["z", "e"])) + assert_frame_equal(make_dataframe(X), expected_df) + + with pytest.raises(ValueError, match=r"(?=.*Polars dataframe)(?=.*index)"): + make_dataframe(X, index=[0, 1]) + + +def test_make_series(): + X = [1, 2, 3] + expected_series = pl.Series(X) + assert_series_equal(make_series(X, index=None), expected_series) + + with pytest.raises(ValueError, match=r"(?=.*Polars series)(?=.*index)"): + make_series(X, index=[0, 1]) From c00ef25f0a066aa8d4747fa532f54e8206edc4dd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 11:24:58 +0100 Subject: [PATCH 6/7] fix select cols --- skrub/_select_cols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_select_cols.py b/skrub/_select_cols.py index 183562d27..8daade477 100644 --- a/skrub/_select_cols.py +++ b/skrub/_select_cols.py @@ -1,6 +1,6 @@ from sklearn.base import BaseEstimator, TransformerMixin -from .dataframe import get_df_namespace +from ._dataframe._namespace import get_df_namespace def _check_columns(df, columns): From 94bc74480b622b2f80545b603a416a0892928567 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 15:14:16 +0100 Subject: [PATCH 7/7] fix tests --- skrub/tests/test_select_cols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/tests/test_select_cols.py b/skrub/tests/test_select_cols.py index 0e36866be..3ab07e590 100644 --- a/skrub/tests/test_select_cols.py +++ b/skrub/tests/test_select_cols.py @@ -3,7 +3,7 @@ import pytest from skrub import DropCols, SelectCols -from skrub.dataframe import POLARS_SETUP +from skrub._dataframe._polars import POLARS_SETUP DATAFRAME_MODULES = [pandas] if POLARS_SETUP: