diff --git a/CHANGES.rst b/CHANGES.rst
index fd61f782b..e520f914d 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -70,6 +70,9 @@ Minor changes
   is now always visible when scrolling the table. :pr:`1102` by
   :user:`Jérôme Dockès`.
 
+* Added a `DropColumnIfNull` transformer that drops columns containing only
+  null values. :pr:`1115` by :user:`Riccardo Cappuzzo`.
+
 Bug fixes
 ---------
 
diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py
index 05f2d856f..e9b526915 100644
--- a/skrub/_dataframe/_common.py
+++ b/skrub/_dataframe/_common.py
@@ -74,6 +74,7 @@
     "to_datetime",
     "is_categorical",
     "to_categorical",
+    "is_all_null",
     #
     # Inspecting, selecting and modifying values
     #
@@ -841,6 +842,28 @@ def _to_categorical_polars(col):
     return _cast_polars(col, pl.Categorical())
 
 
+@dispatch
+def is_all_null(col):
+    raise NotImplementedError()
+
+
+@is_all_null.specialize("pandas", argument_type="Column")
+def _is_all_null_pandas(col):
+    return all(is_null(col))
+
+
+@is_all_null.specialize("polars", argument_type="Column")
+def _is_all_null_polars(col):
+    # The column dtype is Null: every value is null by definition
+    if col.dtype == pl.Null:
+        return True
+    # The dtype is not Null but every value is a null: the null-count check is cheap
+    if col.null_count() == col.len():
+        return True
+    # Some values are not polars nulls: fall back to the slower element-wise
+    # check, which also catches NaN and similar values
+    return all(is_null(col))
+
+
 #
 # Inspecting, selecting and modifying values
 # ==========================================
diff --git a/skrub/_dataframe/tests/test_common.py b/skrub/_dataframe/tests/test_common.py
index 907b9a3a8..efa5e653b 100644
--- a/skrub/_dataframe/tests/test_common.py
+++ b/skrub/_dataframe/tests/test_common.py
@@ -557,6 +557,33 @@ def test_to_categorical(df_module):
     assert list(s.cat.categories) == list("ab")
 
 
+def test_is_all_null(df_module):
+    """Check that is_all_null detects all-null columns correctly."""
+
+    # Check that all-null columns are marked as "all null"
+    assert ns.is_all_null(df_module.make_column("all_null", [None, None, None]))
+    assert ns.is_all_null(df_module.make_column("all_nan", [np.nan, np.nan, np.nan]))
+    assert ns.is_all_null(
+        df_module.make_column("all_nan_or_null", [np.nan, np.nan, None])
+    )
+
+    # Check that the other columns are *not* marked as "all null"
+    assert not ns.is_all_null(
+        df_module.make_column("almost_all_null", ["almost", None, None])
+    )
+    assert not ns.is_all_null(
+        df_module.make_column("almost_all_nan", [2.5, np.nan, np.nan])
+    )
+
+
+def test_is_all_null_polars(pl_module):
+    """Special case for polars: a column full of nulls whose dtype is not Null."""
+    col = pl_module.make_column("col", [1, None, None])
+    col = col[1:]
+
+    assert ns.is_all_null(col)
+
+
 # Inspecting, selecting and modifying values
 # ==========================================
 #
diff --git a/skrub/_drop_column_if_null.py b/skrub/_drop_column_if_null.py
new file mode 100644
index 000000000..612160ef9
--- /dev/null
+++ b/skrub/_drop_column_if_null.py
@@ -0,0 +1,48 @@
+# Drop columns that contain only null values.
+from sklearn.utils.validation import check_is_fitted
+
+from . import _dataframe as sbd
+from ._on_each_column import SingleColumnTransformer
+
+__all__ = ["DropColumnIfNull"]
+
+
+class DropColumnIfNull(SingleColumnTransformer):
+    """Drop a column if it contains only null values (None, NaN, or a mixture
+    of the two); keep it if at least one non-null value is found."""
+
+    def fit_transform(self, column, y=None):
+        """Fit the encoder and transform a column.
+
+        Parameters
+        ----------
+        column : pandas or polars Series
+            The input column to check.
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        column
+            The input column, or an empty list if the column contains only
+            null values.
+        """
+        del y
+
+        self.drop_ = sbd.is_all_null(column)
+
+        return self.transform(column)
+
+    def transform(self, column):
+        """Transform a column.
+
+        Parameters
+        ----------
+        column : pandas or polars Series
+            The input column to check.
+
+        Returns
+        -------
+        column
+            The input column, or an empty list if the column contains only
+            null values.
+        """
+        check_is_fitted(self)
+
+        if self.drop_:
+            return []
+        return column
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index c2eabb511..2bc2e5129 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -16,6 +16,7 @@
 from ._clean_categories import CleanCategories
 from ._clean_null_strings import CleanNullStrings
 from ._datetime_encoder import DatetimeEncoder
+from ._drop_column_if_null import DropColumnIfNull
 from ._gap_encoder import GapEncoder
 from ._on_each_column import SingleColumnTransformer
 from ._select_cols import Drop
@@ -191,6 +192,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         similar functionality to what is offered by scikit-learn's
         :class:`~sklearn.compose.ColumnTransformer`.
 
+    drop_null_columns : bool, default=True
+        If set to `True`, columns that contain only null values are dropped.
+
     n_jobs : int, default=None
         Number of jobs to run in parallel. ``None`` means 1 unless in a joblib
         ``parallel_backend`` context.
@@ -309,12 +313,13 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
 
     Before applying the main transformer, the ``TableVectorizer`` applies
     several preprocessing steps, for example to detect numbers or dates that are
-    represented as strings. Moreover, a final post-processing step is applied to
-    all non-categorical columns in the encoder's output to cast them to float32.
+    represented as strings. By default, columns that contain only null values are
+    dropped. Moreover, a final post-processing step is applied to all
+    non-categorical columns in the encoder's output to cast them to float32.
 
     We can inspect all the processing steps that were applied to a given column:
 
     >>> vectorizer.all_processing_steps_['B']
-    [CleanNullStrings(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat32(), 'B_month': ToFloat32(), ...}]
+    [CleanNullStrings(), DropColumnIfNull(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat32(), 'B_month': ToFloat32(), ...}]
 
     Note that as the encoder (``DatetimeEncoder()`` above) produces multiple
     columns, the last processing step is not described by a single transformer
@@ -323,7 +328,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     ``all_processing_steps_`` is useful to inspect the details of the choices
     made by the ``TableVectorizer`` during preprocessing, for example:
 
-    >>> vectorizer.all_processing_steps_['B'][1]
+    >>> vectorizer.all_processing_steps_['B'][2]
     ToDatetime()
 
     >>> _.format_
     '%d/%m/%Y'
@@ -389,7 +394,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     ``ToDatetime()``:
 
     >>> vectorizer.all_processing_steps_
-    {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), ToFloat32(), PassThrough(), {'C': ToFloat32()}]}
+    {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropColumnIfNull(), ToFloat32(), PassThrough(), {'C': ToFloat32()}]}
 
     Specifying several ``specific_transformers`` for the same column is not
     allowed.
@@ -412,6 +417,7 @@ def __init__(
         numeric=NUMERIC_TRANSFORMER,
         datetime=DATETIME_TRANSFORMER,
         specific_transformers=(),
+        drop_null_columns=True,
         n_jobs=None,
     ):
         self.cardinality_threshold = cardinality_threshold
@@ -425,6 +431,7 @@ def __init__(
         self.datetime = _utils.clone_if_default(datetime, DATETIME_TRANSFORMER)
         self.specific_transformers = specific_transformers
         self.n_jobs = n_jobs
+        self.drop_null_columns = drop_null_columns
 
     def fit(self, X, y=None):
         """Fit transformer.
@@ -536,13 +543,19 @@ def add_step(steps, transformer, cols, allow_reject=False):
 
         cols = s.all() - self._specific_columns
 
         self._preprocessors = [CheckInputDataFrame()]
-        for transformer in [
-            CleanNullStrings(),
+
+        transformer_list = [CleanNullStrings()]
+        if self.drop_null_columns:
+            transformer_list.append(DropColumnIfNull())
+
+        transformer_list += [
             ToDatetime(),
             ToFloat32(),
             CleanCategories(),
             ToStr(),
-        ]:
+        ]
+
+        for transformer in transformer_list:
             add_step(self._preprocessors, transformer, cols, allow_reject=True)
 
         self._encoders = []
diff --git a/skrub/tests/test_drop_column_if_null.py b/skrub/tests/test_drop_column_if_null.py
new file mode 100644
index 000000000..32248ffab
--- /dev/null
+++ b/skrub/tests/test_drop_column_if_null.py
@@ -0,0 +1,62 @@
+import numpy as np
+import pytest
+
+from skrub import _dataframe as sbd
+from skrub._drop_column_if_null import DropColumnIfNull
+
+
+@pytest.fixture
+def drop_null_table(df_module):
+    return df_module.make_dataframe(
+        {
+            "idx": [1, 2, 3],
+            "value_nan": [np.nan, np.nan, np.nan],
+            "value_null": [None, None, None],
+            "value_almost_nan": [2.5, np.nan, np.nan],
+            "value_almost_null": ["almost", None, None],
+            "mixed_null": [None, np.nan, None],
+        }
+    )
+
+
+def test_single_column(drop_null_table, df_module):
+    """Check that null columns are dropped and non-null columns are kept."""
+    dn = DropColumnIfNull()
+    assert dn.fit_transform(sbd.col(drop_null_table, "value_nan")) == []
+    assert dn.fit_transform(sbd.col(drop_null_table, "value_null")) == []
+    assert dn.fit_transform(sbd.col(drop_null_table, "mixed_null")) == []
+
+    df_module.assert_column_equal(
+        dn.fit_transform(sbd.col(drop_null_table, "idx")),
+        df_module.make_column("idx", [1, 2, 3]),
+    )
+
+    df_module.assert_column_equal(
+        dn.fit_transform(sbd.col(drop_null_table, "value_almost_nan")),
+        df_module.make_column("value_almost_nan", [2.5, np.nan, np.nan]),
+    )
+
+    df_module.assert_column_equal(
+        dn.fit_transform(sbd.col(drop_null_table, "value_almost_null")),
+        df_module.make_column("value_almost_null", ["almost", None, None]),
+    )
diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py
index 23a8cff16..cc0f353f8 100644
--- a/skrub/tests/test_table_vectorizer.py
+++ b/skrub/tests/test_table_vectorizer.py
@@ -164,6 +164,28 @@ def _get_datetimes_dataframe():
     )
 
 
+def _get_missing_values_dataframe(categorical_dtype="object"):
+    """
+    Create a simple DataFrame in which some columns contain only missing values.
+
+    Different types of missing values (np.nan, pd.NA, None) are used to test
+    how the vectorizer handles all-null columns with mixed null values.
+ """ + return pd.DataFrame( + { + "int": pd.Series([15, 56, pd.NA, 12, 44], dtype="Int64"), + "all_null": pd.Series( + [None, None, None, None, None], dtype=categorical_dtype + ), + "all_nan": pd.Series( + [np.nan, np.nan, np.nan, np.nan, np.nan], dtype="Float64" + ), + "mixed_nulls": pd.Series( + [np.nan, None, pd.NA, "NULL", "NA"], dtype=categorical_dtype + ), + } + ) + + def test_fit_default_transform(): X = _get_clean_dataframe() vectorizer = TableVectorizer() @@ -506,8 +528,11 @@ def test_changing_types(X_train, X_test, expected_X_out): """ table_vec = TableVectorizer( # only extract the total seconds - datetime=DatetimeEncoder(resolution=None) + datetime=DatetimeEncoder(resolution=None), + # True by default + drop_null_columns=False, ) + table_vec.fit(X_train) X_out = table_vec.transform(X_test) assert (X_out.isna() == expected_X_out.isna()).all().all() @@ -734,3 +759,18 @@ def test_supervised_encoder(df_module): y = np.random.default_rng(0).normal(size=sbd.shape(X)[0]) tv = TableVectorizer(low_cardinality=TargetEncoder()) tv.fit_transform(X, y) + + +def test_drop_null_column(): + """Check that all null columns are dropped, and no more.""" + # Don't drop null columns + X = _get_missing_values_dataframe() + tv = TableVectorizer(drop_null_columns=False) + transformed = tv.fit_transform(X) + + assert sbd.shape(transformed) == sbd.shape(X) + + # Drop null columns + tv = TableVectorizer(drop_null_columns=True) + transformed = tv.fit_transform(X) + assert sbd.shape(transformed) == (sbd.shape(X)[0], 1)