
[MRG] Add SelectCols(cols) and DropCols(cols) transformers #804

Merged: 15 commits, Oct 27, 2023
5 changes: 5 additions & 0 deletions CHANGES.rst
@@ -45,6 +45,11 @@ Major changes
aggregation on the target y, followed by left-joining on a base table.
:pr:`600` by :user:`Vincent Maladiere <Vincent-Maladiere>`.

* Added the :class:`SelectCols` and :class:`DropCols` transformers that allow
  selecting a subset of a dataframe's columns inside a pipeline. :pr:`804` by
:user:`Jérôme Dockès <jeromedockes>`.


Minor changes
-------------

14 changes: 14 additions & 0 deletions doc/api.rst
@@ -36,6 +36,20 @@ This page lists all available functions and classes of `skrub`.
AggTarget


.. raw:: html

<h2>Column selection in a pipeline</h2>

.. autosummary::
:toctree: generated/
:template: class.rst
:nosignatures:
:caption: Column selection in a pipeline

SelectCols
DropCols


.. raw:: html

<h2>Vectorizing a dataframe</h2>
11 changes: 11 additions & 0 deletions doc/assembling.rst
@@ -50,6 +50,11 @@ In addition, skrub also enables more advanced analysis:
leakage, then join the result back on the main table, similar to AggJoiner.


Column selection inside a pipeline
----------------------------------

Besides joins, another common operation on a dataframe is to select a subset of its columns (also known as a projection).
We sometimes need such a selection in the middle of a pipeline, for example when a column is needed for a join (with :class:`Joiner`) but should be dropped before fitting an estimator in a subsequent step.

skrub provides transformers to perform such an operation:

- :class:`SelectCols` allows specifying the columns we want to keep.
- Conversely, :class:`DropCols` allows specifying the columns we want to discard.

Going further: embeddings for better analytics
----------------------------------------------

67 changes: 26 additions & 41 deletions examples/04_fuzzy_joining.py
@@ -362,51 +362,31 @@
# .............................

y = df["Happiness score"]
#######################################################################
# We gather the auxiliary tables into a
# list of (tables, keys) for the `tables` parameter.
# An instance of the transformer with the necessary information is:
from skrub import Joiner

joiner = Joiner(
tables=[
(gdppc, "Country Name"),
(life_exp, "Country Name"),
(legal_rights, "Country Name"),
],
main_key="Country",
)

#################################################################
# Fitting and transforming into the final table
# .............................................
# To get our final joined table we will fit and transform the main table (df)
# with our created instance of the |joiner|:
df_final = joiner.fit_transform(df)

df_final.head(10)
df = df.drop("Happiness score", axis=1)

##########################################################################
# And that's it! As previously, we now have a big table
# ready for machine learning.
# Let's create our machine learning pipeline:
from sklearn.compose import make_column_transformer
from skrub import Joiner, SelectCols, DropCols
from sklearn.pipeline import make_pipeline

# We include only the columns that will be pertinent for our regression:
encoder = make_column_transformer(
(
"passthrough",
[
"GDP per capita (current US$)",
"Life expectancy at birth, total (years)",
"Strength of legal rights index (0=weak to 12=strong)",
],
),
remainder="drop",
# We create a selector that we will insert at the end of our pipeline, to
# select the relevant columns before fitting the regressor:

selector = SelectCols(
[
"GDP per capita (current US$)",
"Life expectancy at birth, total (years)",
"Strength of legal rights index (0=weak to 12=strong)",
]
)
pipeline = make_pipeline(
Joiner((gdppc, "Country Name"), "Country"),
DropCols("Country Name"),
Joiner((life_exp, "Country Name"), "Country"),
DropCols("Country Name"),
Joiner((legal_rights, "Country Name"), "Country"),
selector,
HistGradientBoostingRegressor(),
)

pipeline = make_pipeline(joiner, encoder, HistGradientBoostingRegressor())

##########################################################################
# And the best part is that we are now able to evaluate the parameters of the |fj|.
@@ -416,12 +396,17 @@
from sklearn.model_selection import GridSearchCV

# We will test two possible values of match_score for each joiner:
params = {"joiner__match_score": [0.2, 0.3, 0.4, 0.5]}
params = {
"joiner-1__match_score": [0.2, 0.9],
"joiner-2__match_score": [0.2, 0.9],
"joiner-3__match_score": [0.2, 0.9],
}

grid = GridSearchCV(pipeline, param_grid=params)
grid.fit(df, y)

print(grid.best_params_)
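Because the pipeline contains three ``Joiner`` steps, ``make_pipeline`` derives a unique name for each by suffixing the lowercased class name (``joiner-1``, ``joiner-2``, ``joiner-3``), which is why the grid keys above carry suffixes. A quick illustration of this scikit-learn naming rule with a repeated estimator:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# make_pipeline suffixes repeated estimator types so that each step
# gets a unique name, usable as a grid-search parameter prefix.
pipe = make_pipeline(StandardScaler(), StandardScaler())
print(list(pipe.named_steps))  # -> ['standardscaler-1', 'standardscaler-2']
```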

##########################################################################
# The grid search gave us the best values for the ``match_score``
# parameters. Let's use them in our regression:
3 changes: 3 additions & 0 deletions skrub/__init__.py
@@ -11,6 +11,7 @@
from ._gap_encoder import GapEncoder
from ._joiner import Joiner
from ._minhash_encoder import MinHashEncoder
from ._select_cols import DropCols, SelectCols
from ._similarity_encoder import SimilarityEncoder
from ._table_vectorizer import SuperVectorizer, TableVectorizer
from ._target_encoder import TargetEncoder
@@ -35,4 +36,6 @@
"compute_ngram_distance",
"AggJoiner",
"AggTarget",
"SelectCols",
"DropCols",
]
179 changes: 179 additions & 0 deletions skrub/_select_cols.py
@@ -0,0 +1,179 @@
from sklearn.base import BaseEstimator, TransformerMixin

from .dataframe import get_df_namespace


def _check_columns(df, columns):
"""Check that provided columns exist in the dataframe and return them in a list.

Checking this ourselves allows having the same exception for both pandas
and polars dataframes.

If `df` is not a dataframe (does not have a ``columns`` attribute), skip
the check. As the transformers in this module are basically stateless,
this allows getting an operational transformer without fit data; for
example ``selector = SelectCols(["A", "B"]).fit(None)``, as the fit data is
    not used for anything other than this check.

If ``columns`` is a ``str`` (a single column name), the return value wraps
it in a list (of length 1).
"""
if isinstance(columns, str):
columns = [columns]
columns = list(columns)
if not hasattr(df, "columns"):
return columns
diff = set(columns) - set(df.columns)
if not diff:
return columns
raise ValueError(
f"The following columns were not found in the input DataFrame: {diff}"
)


class SelectCols(TransformerMixin, BaseEstimator):
"""Select a subset of a DataFrame's columns.

A ``ValueError`` is raised if any of the provided column names are not in
the dataframe.

Accepts :obj:`pandas.DataFrame` and :obj:`polars.DataFrame` inputs.

Parameters
----------
cols : list of str or str
The columns to select. A single column name can be passed as a ``str``:
``"col_name"`` is the same as ``["col_name"]``.

Examples
--------
>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1, 2], "B": [10, 20], "C": ["x", "y"]})
>>> df
A B C
0 1 10 x
1 2 20 y
>>> SelectCols(["C", "A"]).fit_transform(df)
C A
0 x 1
1 y 2
>>> SelectCols(["X", "A"]).fit_transform(df)
Traceback (most recent call last):
...
ValueError: The following columns were not found in the input DataFrame: {'X'}
"""

def __init__(self, cols):
self.cols = cols

def fit(self, X, y=None):
"""Fit the transformer.

Parameters
----------
X : DataFrame or None
If `X` is a DataFrame, the transformer checks that all the column
names provided in ``self.cols`` can be found in `X`.

y : None
Unused.

Returns
-------
SelectCols
The transformer itself.
"""
_check_columns(X, self.cols)
return self

def transform(self, X):
"""Transform a dataframe by selecting columns.

Parameters
----------
X : DataFrame
The DataFrame on which to apply the selection.

Returns
-------
DataFrame
The input DataFrame ``X`` after selecting only the columns listed
in ``self.cols`` (in the provided order).
"""
cols = _check_columns(X, self.cols)
namespace, _ = get_df_namespace(X)
return namespace.select(X, cols)


class DropCols(TransformerMixin, BaseEstimator):
"""Drop a subset of a DataFrame's columns.

The other columns are kept in their original order. A ``ValueError`` is
raised if any of the provided column names are not in the dataframe.

Accepts :obj:`pandas.DataFrame` and :obj:`polars.DataFrame` inputs.

Parameters
----------
cols : list of str or str
The columns to drop. A single column name can be passed as a ``str``:
``"col_name"`` is the same as ``["col_name"]``.

Examples
--------
>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1, 2], "B": [10, 20], "C": ["x", "y"]})
>>> df
A B C
0 1 10 x
1 2 20 y
>>> DropCols(["A", "C"]).fit_transform(df)
B
0 10
1 20
>>> DropCols(["X"]).fit_transform(df)
Traceback (most recent call last):
...
ValueError: The following columns were not found in the input DataFrame: {'X'}
"""

def __init__(self, cols):
self.cols = cols

def fit(self, X, y=None):
"""Fit the transformer.

Parameters
----------
X : DataFrame or None
If `X` is a DataFrame, the transformer checks that all the column
names provided in ``self.cols`` can be found in `X`.

y : None
Unused.

Returns
-------
DropCols
The transformer itself.
"""
_check_columns(X, self.cols)
return self

def transform(self, X):
"""Transform a dataframe by dropping columns.

Parameters
----------
X : DataFrame
            The DataFrame from which to drop the columns.

Returns
-------
DataFrame
The input DataFrame ``X`` after dropping the columns listed in
``self.cols``.
"""
cols = _check_columns(X, self.cols)
namespace, _ = get_df_namespace(X)
return namespace.select(X, [c for c in X.columns if c not in cols])
4 changes: 4 additions & 0 deletions skrub/dataframe/_pandas.py
@@ -291,3 +291,7 @@ def split_num_categ_cols(table):
categ_cols = table.select_dtypes(["object", "string", "category"]).columns

return num_cols, categ_cols


def select(dataframe, columns):
return dataframe[columns]
4 changes: 4 additions & 0 deletions skrub/dataframe/_polars.py
@@ -216,3 +216,7 @@ def split_num_categ_cols(table):
categ_cols = table.select(cs.string()).columns

return num_cols, categ_cols


def select(dataframe, columns):
return dataframe.select(columns)
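For reference, ``DropCols.transform`` reduces to selecting the complement of the dropped columns through the same ``select`` helper; a standalone pandas sketch of that logic (the helper names mirror the ones above):

```python
import pandas as pd

def select(dataframe, columns):
    # pandas backend: label-based indexing preserves the requested order
    return dataframe[columns]

def drop(dataframe, columns):
    # the kept columns stay in their original order
    return select(dataframe, [c for c in dataframe.columns if c not in columns])

df = pd.DataFrame({"A": [1], "B": [2], "C": [3]})
result = drop(df, ["B"])
print(list(result.columns))  # -> ['A', 'C']
```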