Added DropNullColumn transformer to remove columns that contain only nulls (skrub-data#1115)
rcap107 authored Nov 18, 2024
1 parent 5d59bf5 commit 2cdf8ad
Showing 7 changed files with 225 additions and 9 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
@@ -70,6 +70,9 @@ Minor changes
is now always visible when scrolling the table. :pr:`1102` by :user:`Jérôme
Dockès <jeromedockes>`.

* Added a `DropColumnIfNull` transformer that drops columns that contain only null
  values. :pr:`1115` by :user:`Riccardo Cappuzzo <riccardocappuzzo>`.

Bug fixes
---------

23 changes: 23 additions & 0 deletions skrub/_dataframe/_common.py
@@ -74,6 +74,7 @@
"to_datetime",
"is_categorical",
"to_categorical",
"is_all_null",
#
# Inspecting, selecting and modifying values
#
@@ -841,6 +842,28 @@ def _to_categorical_polars(col):
    return _cast_polars(col, pl.Categorical())


@dispatch
def is_all_null(col):
    raise NotImplementedError()


@is_all_null.specialize("pandas", argument_type="Column")
def _is_all_null_pandas(col):
    return all(is_null(col))


@is_all_null.specialize("polars", argument_type="Column")
def _is_all_null_polars(col):
    # The column's dtype is Null: trivially all null.
    if col.dtype == pl.Null:
        return True
    # The dtype is not Null but every value is null: null_count() is the
    # more efficient check.
    if col.null_count() == col.len():
        return True
    # Not every value is a typed null; fall back to the slower elementwise
    # check, which also catches NaN and the like.
    return all(is_null(col))


#
# Inspecting, selecting and modifying values
# ==========================================
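Reviewer's note: the dispatch above gives `is_all_null` separate pandas and polars implementations behind one entry point. A minimal sketch of the resulting behaviour, assuming only public pandas/polars calls plus skrub's internal `skrub._dataframe` namespace, imported the same way the tests below import it:

import numpy as np
import pandas as pd
import polars as pl

from skrub import _dataframe as sbd  # internal namespace, as used in skrub's tests

# pandas: NaN-only columns count as "all null"; one real value is enough to keep
assert sbd.is_all_null(pd.Series([np.nan, np.nan]))
assert not sbd.is_all_null(pd.Series([1.0, np.nan]))

# polars: slicing away the only non-null value leaves an Int64 column whose
# values are all null; the null_count() fast path catches this case
col = pl.Series("col", [1, None, None])[1:]
assert sbd.is_all_null(col)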
27 changes: 27 additions & 0 deletions skrub/_dataframe/tests/test_common.py
@@ -557,6 +557,33 @@ def test_to_categorical(df_module):
    assert list(s.cat.categories) == list("ab")


def test_is_all_null(df_module):
    """Check that is_all_null is evaluating null counts correctly."""

    # Check that all-null columns are marked as "all null"
    assert ns.is_all_null(df_module.make_column("all_null", [None, None, None]))
    assert ns.is_all_null(df_module.make_column("all_nan", [np.nan, np.nan, np.nan]))
    assert ns.is_all_null(
        df_module.make_column("all_nan_or_null", [np.nan, np.nan, None])
    )

    # Check that the other columns are *not* marked as "all null"
    assert not ns.is_all_null(
        df_module.make_column("almost_all_null", ["almost", None, None])
    )
    assert not ns.is_all_null(
        df_module.make_column("almost_all_nan", [2.5, None, None])
    )


def test_is_all_null_polars(pl_module):
    """Special case for polars: column is full of nulls, but doesn't have dtype Null."""
    col = pl_module.make_column("col", [1, None, None])
    col = col[1:]

    assert ns.is_all_null(col)


# Inspecting, selecting and modifying values
# ==========================================
#
48 changes: 48 additions & 0 deletions skrub/_drop_column_if_null.py
@@ -0,0 +1,48 @@
# drop columns that contain all null values
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from ._on_each_column import SingleColumnTransformer

__all__ = ["DropColumnIfNull"]


class DropColumnIfNull(SingleColumnTransformer):
    """Drop a single column if it contains only null values (None, NaN, or a
    mixture of both). If at least one non-null value is found, the column is
    kept."""

    def fit_transform(self, column, y=None):
        """Fit the transformer and transform a column.

        Parameters
        ----------
        column : Pandas or Polars series. The input column to check.
        y : None. Ignored.

        Returns
        -------
        The input column, or an empty list if the column contains only null values.
        """
        del y

        self.drop_ = sbd.is_all_null(column)

        return self.transform(column)

    def transform(self, column):
        """Transform a column.

        Parameters
        ----------
        column : Pandas or Polars series. The input column to check.

        Returns
        -------
        column
            The input column, or an empty list if the column contains only null values.
        """
        check_is_fitted(self)

        if self.drop_:
            return []
        return column
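Reviewer's note: a quick sketch of the transformer in isolation. The import from the private `skrub._drop_column_if_null` module mirrors the tests below; the column names are illustrative.

import numpy as np
import pandas as pd

from skrub._drop_column_if_null import DropColumnIfNull

t = DropColumnIfNull()

# An all-null column is dropped: the output is an empty list of columns.
assert t.fit_transform(pd.Series([np.nan, None, np.nan], name="empty")) == []

# A column with at least one non-null value passes through unchanged.
kept = pd.Series([1.0, np.nan, 3.0], name="kept")
assert t.fit_transform(kept) is kept

Returning an empty list is how a single-column transformer signals that the column contributes no output columns.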
29 changes: 21 additions & 8 deletions skrub/_table_vectorizer.py
@@ -16,6 +16,7 @@
from ._clean_categories import CleanCategories
from ._clean_null_strings import CleanNullStrings
from ._datetime_encoder import DatetimeEncoder
from ._drop_column_if_null import DropColumnIfNull
from ._gap_encoder import GapEncoder
from ._on_each_column import SingleColumnTransformer
from ._select_cols import Drop
@@ -191,6 +192,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
similar functionality to what is offered by scikit-learn's
:class:`~sklearn.compose.ColumnTransformer`.
drop_null_columns : bool, default=True
    If set to `True`, columns that contain only null values are dropped.
n_jobs : int, default=None
    Number of jobs to run in parallel.
    ``None`` means 1 unless in a joblib ``parallel_backend`` context.
@@ -309,12 +313,13 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
Before applying the main transformer, the ``TableVectorizer`` applies
several preprocessing steps, for example to detect numbers or dates that are
- represented as strings. Moreover, a final post-processing step is applied to
- all non-categorical columns in the encoder's output to cast them to float32.
+ represented as strings. By default, columns that contain only null values are
+ dropped. Moreover, a final post-processing step is applied to all
+ non-categorical columns in the encoder's output to cast them to float32.
We can inspect all the processing steps that were applied to a given column:
>>> vectorizer.all_processing_steps_['B']
- [CleanNullStrings(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat32(), 'B_month': ToFloat32(), ...}]
+ [CleanNullStrings(), DropColumnIfNull(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat32(), 'B_month': ToFloat32(), ...}]
Note that as the encoder (``DatetimeEncoder()`` above) produces multiple
columns, the last processing step is not described by a single transformer
@@ -323,7 +328,7 @@
``all_processing_steps_`` is useful to inspect the details of the
choices made by the ``TableVectorizer`` during preprocessing, for example:
- >>> vectorizer.all_processing_steps_['B'][1]
+ >>> vectorizer.all_processing_steps_['B'][2]
ToDatetime()
>>> _.format_
'%d/%m/%Y'
@@ -389,7 +394,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
``ToDatetime()``:
>>> vectorizer.all_processing_steps_
- {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), ToFloat32(), PassThrough(), {'C': ToFloat32()}]}
+ {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropColumnIfNull(), ToFloat32(), PassThrough(), {'C': ToFloat32()}]}
Specifying several ``specific_transformers`` for the same column is not allowed.
@@ -412,6 +417,7 @@ def __init__(
        numeric=NUMERIC_TRANSFORMER,
        datetime=DATETIME_TRANSFORMER,
        specific_transformers=(),
        drop_null_columns=True,
        n_jobs=None,
    ):
        self.cardinality_threshold = cardinality_threshold
@@ -425,6 +431,7 @@ def __init__(
        self.datetime = _utils.clone_if_default(datetime, DATETIME_TRANSFORMER)
        self.specific_transformers = specific_transformers
        self.n_jobs = n_jobs
        self.drop_null_columns = drop_null_columns

    def fit(self, X, y=None):
        """Fit transformer.
@@ -536,13 +543,19 @@ def add_step(steps, transformer, cols, allow_reject=False):
        cols = s.all() - self._specific_columns

        self._preprocessors = [CheckInputDataFrame()]
-       for transformer in [
-           CleanNullStrings(),
+
+       transformer_list = [CleanNullStrings()]
+       if self.drop_null_columns:
+           transformer_list.append(DropColumnIfNull())
+
+       transformer_list += [
            ToDatetime(),
            ToFloat32(),
            CleanCategories(),
            ToStr(),
-       ]:
+       ]
+
+       for transformer in transformer_list:
            add_step(self._preprocessors, transformer, cols, allow_reject=True)

        self._encoders = []
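Reviewer's note: a short sketch of the new flag from the user's side. The column names and data are illustrative; the expected shapes follow the test added in test_table_vectorizer.py below.

import numpy as np
import pandas as pd

from skrub import TableVectorizer

X = pd.DataFrame({"idx": [1.0, 2.0, 3.0], "empty": [np.nan, np.nan, np.nan]})

# Default (drop_null_columns=True): the all-null column is dropped.
assert TableVectorizer().fit_transform(X).shape == (3, 1)

# Opting out keeps the all-null column in the output.
assert TableVectorizer(drop_null_columns=False).fit_transform(X).shape == (3, 2)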
62 changes: 62 additions & 0 deletions skrub/tests/test_drop_column_if_null.py
@@ -0,0 +1,62 @@
import numpy as np
import pytest

from skrub import _dataframe as sbd
from skrub._drop_column_if_null import DropColumnIfNull


@pytest.fixture
def drop_null_table(df_module):
    return df_module.make_dataframe(
        {
            "idx": [
                1,
                2,
                3,
            ],
            "value_nan": [
                np.nan,
                np.nan,
                np.nan,
            ],
            "value_null": [
                None,
                None,
                None,
            ],
            "value_almost_nan": [
                2.5,
                np.nan,
                np.nan,
            ],
            "value_almost_null": [
                "almost",
                None,
                None,
            ],
            "mixed_null": [None, np.nan, None],
        }
    )


def test_single_column(drop_null_table, df_module):
    """Check that null columns are dropped and non-null columns are kept."""
    dn = DropColumnIfNull()
    assert dn.fit_transform(sbd.col(drop_null_table, "value_nan")) == []
    assert dn.fit_transform(sbd.col(drop_null_table, "value_null")) == []
    assert dn.fit_transform(sbd.col(drop_null_table, "mixed_null")) == []

    df_module.assert_column_equal(
        dn.fit_transform(sbd.col(drop_null_table, "idx")),
        df_module.make_column("idx", [1, 2, 3]),
    )

    df_module.assert_column_equal(
        dn.fit_transform(sbd.col(drop_null_table, "value_almost_nan")),
        df_module.make_column("value_almost_nan", [2.5, np.nan, np.nan]),
    )

    df_module.assert_column_equal(
        dn.fit_transform(sbd.col(drop_null_table, "value_almost_null")),
        df_module.make_column("value_almost_null", ["almost", None, None]),
    )
42 changes: 41 additions & 1 deletion skrub/tests/test_table_vectorizer.py
@@ -164,6 +164,28 @@ def _get_datetimes_dataframe():
    )


def _get_missing_values_dataframe(categorical_dtype="object"):
    """
    Creates a simple DataFrame with some columns that contain only missing values.
    We'll use different types of missing values (np.nan, pd.NA, None)
    to test how the vectorizer handles full null columns with mixed null values.
    """
    return pd.DataFrame(
        {
            "int": pd.Series([15, 56, pd.NA, 12, 44], dtype="Int64"),
            "all_null": pd.Series(
                [None, None, None, None, None], dtype=categorical_dtype
            ),
            "all_nan": pd.Series(
                [np.nan, np.nan, np.nan, np.nan, np.nan], dtype="Float64"
            ),
            "mixed_nulls": pd.Series(
                [np.nan, None, pd.NA, "NULL", "NA"], dtype=categorical_dtype
            ),
        }
    )


def test_fit_default_transform():
    X = _get_clean_dataframe()
    vectorizer = TableVectorizer()
@@ -506,8 +528,11 @@ def test_changing_types(X_train, X_test, expected_X_out):
"""
table_vec = TableVectorizer(
# only extract the total seconds
datetime=DatetimeEncoder(resolution=None)
datetime=DatetimeEncoder(resolution=None),
# True by default
drop_null_columns=False,
)

table_vec.fit(X_train)
X_out = table_vec.transform(X_test)
assert (X_out.isna() == expected_X_out.isna()).all().all()
@@ -734,3 +759,18 @@ def test_supervised_encoder(df_module):
    y = np.random.default_rng(0).normal(size=sbd.shape(X)[0])
    tv = TableVectorizer(low_cardinality=TargetEncoder())
    tv.fit_transform(X, y)


def test_drop_null_column():
    """Check that all null columns are dropped, and no more."""
    # Don't drop null columns
    X = _get_missing_values_dataframe()
    tv = TableVectorizer(drop_null_columns=False)
    transformed = tv.fit_transform(X)

    assert sbd.shape(transformed) == sbd.shape(X)

    # Drop null columns
    tv = TableVectorizer(drop_null_columns=True)
    transformed = tv.fit_transform(X)
    assert sbd.shape(transformed) == (sbd.shape(X)[0], 1)
