Added DropNullColumn transformer to remove columns that contain only nulls (skrub-data#1115)
rcap107 authored Nov 18, 2024
1 parent 5d59bf5 commit 2cdf8ad
Showing 7 changed files with 225 additions and 9 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
@@ -70,6 +70,9 @@ Minor changes
is now always visible when scrolling the table. :pr:`1102` by :user:`Jérôme
Dockès <jeromedockes>`.

* Added a `DropColumnIfNull` transformer that drops columns that contain only null
  values. :pr:`1115` by :user:`Riccardo Cappuzzo <riccardocappuzzo>`.

Bug fixes
---------

23 changes: 23 additions & 0 deletions skrub/_dataframe/_common.py
@@ -74,6 +74,7 @@
"to_datetime",
"is_categorical",
"to_categorical",
"is_all_null",
#
# Inspecting, selecting and modifying values
#
@@ -841,6 +842,28 @@ def _to_categorical_polars(col):
    return _cast_polars(col, pl.Categorical())


@dispatch
def is_all_null(col):
    raise NotImplementedError()


@is_all_null.specialize("pandas", argument_type="Column")
def _is_all_null_pandas(col):
    return all(is_null(col))


@is_all_null.specialize("polars", argument_type="Column")
def _is_all_null_polars(col):
    # The column's dtype is Null: trivially all null.
    if col.dtype == pl.Null:
        return True
    # The dtype is not Null but every value is null: null_count() is the
    # more efficient check.
    if col.null_count() == col.len():
        return True
    # Not every value is a typed null; fall back to the slower elementwise
    # check, which also catches NaN and the like.
    return all(is_null(col))


#
# Inspecting, selecting and modifying values
# ==========================================
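Reviewer's note: the dispatch above gives `is_all_null` separate pandas and polars implementations behind one entry point. A minimal sketch of the resulting behaviour, assuming only public pandas/polars calls plus skrub's internal `skrub._dataframe` namespace, imported the same way the tests below import it:

import numpy as np
import pandas as pd
import polars as pl

from skrub import _dataframe as sbd  # internal namespace, as used in skrub's tests

# pandas: NaN-only columns count as "all null"; one real value is enough to keep
assert sbd.is_all_null(pd.Series([np.nan, np.nan]))
assert not sbd.is_all_null(pd.Series([1.0, np.nan]))

# polars: slicing away the only non-null value leaves an Int64 column whose
# values are all null; the null_count() fast path catches this case
col = pl.Series("col", [1, None, None])[1:]
assert sbd.is_all_null(col)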
27 changes: 27 additions & 0 deletions skrub/_dataframe/tests/test_common.py
@@ -557,6 +557,33 @@ def test_to_categorical(df_module):
    assert list(s.cat.categories) == list("ab")


def test_is_all_null(df_module):
    """Check that is_all_null is evaluating null counts correctly."""

    # Check that all-null columns are marked as "all null"
    assert ns.is_all_null(df_module.make_column("all_null", [None, None, None]))
    assert ns.is_all_null(df_module.make_column("all_nan", [np.nan, np.nan, np.nan]))
    assert ns.is_all_null(
        df_module.make_column("all_nan_or_null", [np.nan, np.nan, None])
    )

    # Check that the other columns are *not* marked as "all null"
    assert not ns.is_all_null(
        df_module.make_column("almost_all_null", ["almost", None, None])
    )
    assert not ns.is_all_null(
        df_module.make_column("almost_all_nan", [2.5, None, None])
    )


def test_is_all_null_polars(pl_module):
    """Special case for polars: column is full of nulls, but doesn't have dtype Null."""
    col = pl_module.make_column("col", [1, None, None])
    col = col[1:]

    assert ns.is_all_null(col)


# Inspecting, selecting and modifying values
# ==========================================
#
48 changes: 48 additions & 0 deletions skrub/_drop_column_if_null.py
@@ -0,0 +1,48 @@
# drop columns that contain all null values
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from ._on_each_column import SingleColumnTransformer

__all__ = ["DropColumnIfNull"]


class DropColumnIfNull(SingleColumnTransformer):
    """Drop a single column if it contains only null values (None, NaN, or a
    mixture of both). If at least one non-null value is found, the column is
    kept."""

    def fit_transform(self, column, y=None):
        """Fit the transformer and transform a column.

        Parameters
        ----------
        column : Pandas or Polars series. The input column to check.
        y : None. Ignored.

        Returns
        -------
        The input column, or an empty list if the column contains only null values.
        """
        del y

        self.drop_ = sbd.is_all_null(column)

        return self.transform(column)

    def transform(self, column):
        """Transform a column.

        Parameters
        ----------
        column : Pandas or Polars series. The input column to check.

        Returns
        -------
        column
            The input column, or an empty list if the column contains only null values.
        """
        check_is_fitted(self)

        if self.drop_:
            return []
        return column
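Reviewer's note: a quick sketch of the transformer in isolation. The import from the private `skrub._drop_column_if_null` module mirrors the tests below; the column names are illustrative.

import numpy as np
import pandas as pd

from skrub._drop_column_if_null import DropColumnIfNull

t = DropColumnIfNull()

# An all-null column is dropped: the output is an empty list of columns.
assert t.fit_transform(pd.Series([np.nan, None, np.nan], name="empty")) == []

# A column with at least one non-null value passes through unchanged.
kept = pd.Series([1.0, np.nan, 3.0], name="kept")
assert t.fit_transform(kept) is kept

Returning an empty list is how a single-column transformer signals that the column contributes no output columns.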
29 changes: 21 additions & 8 deletions skrub/_table_vectorizer.py
@@ -16,6 +16,7 @@
from ._clean_categories import CleanCategories
from ._clean_null_strings import CleanNullStrings
from ._datetime_encoder import DatetimeEncoder
from ._drop_column_if_null import DropColumnIfNull
from ._gap_encoder import GapEncoder
from ._on_each_column import SingleColumnTransformer
from ._select_cols import Drop
@@ -191,6 +192,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
similar functionality to what is offered by scikit-learn's
:class:`~sklearn.compose.ColumnTransformer`.
drop_null_columns : bool, default=True
    If set to `True`, columns that contain only null values are dropped.
n_jobs : int, default=None
    Number of jobs to run in parallel.
    ``None`` means 1 unless in a joblib ``parallel_backend`` context.
@@ -309,12 +313,13 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
Before applying the main transformer, the ``TableVectorizer`` applies
several preprocessing steps, for example to detect numbers or dates that are
- represented as strings. Moreover, a final post-processing step is applied to
- all non-categorical columns in the encoder's output to cast them to float32.
+ represented as strings. By default, columns that contain only null values are
+ dropped. Moreover, a final post-processing step is applied to all
+ non-categorical columns in the encoder's output to cast them to float32.
We can inspect all the processing steps that were applied to a given column:
>>> vectorizer.all_processing_steps_['B']
- [CleanNullStrings(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat32(), 'B_month': ToFloat32(), ...}]
+ [CleanNullStrings(), DropColumnIfNull(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat32(), 'B_month': ToFloat32(), ...}]
Note that as the encoder (``DatetimeEncoder()`` above) produces multiple
columns, the last processing step is not described by a single transformer
@@ -323,7 +328,7 @@
``all_processing_steps_`` is useful to inspect the details of the
choices made by the ``TableVectorizer`` during preprocessing, for example:
- >>> vectorizer.all_processing_steps_['B'][1]
+ >>> vectorizer.all_processing_steps_['B'][2]
ToDatetime()
>>> _.format_
'%d/%m/%Y'
@@ -389,7 +394,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
``ToDatetime()``:
>>> vectorizer.all_processing_steps_
- {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), ToFloat32(), PassThrough(), {'C': ToFloat32()}]}
+ {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropColumnIfNull(), ToFloat32(), PassThrough(), {'C': ToFloat32()}]}
Specifying several ``specific_transformers`` for the same column is not allowed.
@@ -412,6 +417,7 @@ def __init__(
        numeric=NUMERIC_TRANSFORMER,
        datetime=DATETIME_TRANSFORMER,
        specific_transformers=(),
        drop_null_columns=True,
        n_jobs=None,
    ):
        self.cardinality_threshold = cardinality_threshold
@@ -425,6 +431,7 @@ def __init__(
        self.datetime = _utils.clone_if_default(datetime, DATETIME_TRANSFORMER)
        self.specific_transformers = specific_transformers
        self.n_jobs = n_jobs
        self.drop_null_columns = drop_null_columns

    def fit(self, X, y=None):
        """Fit transformer.
@@ -536,13 +543,19 @@ def add_step(steps, transformer, cols, allow_reject=False):
        cols = s.all() - self._specific_columns

        self._preprocessors = [CheckInputDataFrame()]
-       for transformer in [
-           CleanNullStrings(),
+
+       transformer_list = [CleanNullStrings()]
+       if self.drop_null_columns:
+           transformer_list.append(DropColumnIfNull())
+
+       transformer_list += [
            ToDatetime(),
            ToFloat32(),
            CleanCategories(),
            ToStr(),
-       ]:
+       ]
+
+       for transformer in transformer_list:
            add_step(self._preprocessors, transformer, cols, allow_reject=True)

        self._encoders = []
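Reviewer's note: a short sketch of the new flag from the user's side. The column names and data are illustrative; the expected shapes follow the test added in test_table_vectorizer.py below.

import numpy as np
import pandas as pd

from skrub import TableVectorizer

X = pd.DataFrame({"idx": [1.0, 2.0, 3.0], "empty": [np.nan, np.nan, np.nan]})

# Default (drop_null_columns=True): the all-null column is dropped.
assert TableVectorizer().fit_transform(X).shape == (3, 1)

# Opting out keeps the all-null column in the output.
assert TableVectorizer(drop_null_columns=False).fit_transform(X).shape == (3, 2)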
62 changes: 62 additions & 0 deletions skrub/tests/test_drop_column_if_null.py
@@ -0,0 +1,62 @@
import numpy as np
import pytest

from skrub import _dataframe as sbd
from skrub._drop_column_if_null import DropColumnIfNull


@pytest.fixture
def drop_null_table(df_module):
    return df_module.make_dataframe(
        {
            "idx": [
                1,
                2,
                3,
            ],
            "value_nan": [
                np.nan,
                np.nan,
                np.nan,
            ],
            "value_null": [
                None,
                None,
                None,
            ],
            "value_almost_nan": [
                2.5,
                np.nan,
                np.nan,
            ],
            "value_almost_null": [
                "almost",
                None,
                None,
            ],
            "mixed_null": [None, np.nan, None],
        }
    )


def test_single_column(drop_null_table, df_module):
    """Check that null columns are dropped and non-null columns are kept."""
    dn = DropColumnIfNull()
    assert dn.fit_transform(sbd.col(drop_null_table, "value_nan")) == []
    assert dn.fit_transform(sbd.col(drop_null_table, "value_null")) == []
    assert dn.fit_transform(sbd.col(drop_null_table, "mixed_null")) == []

    df_module.assert_column_equal(
        dn.fit_transform(sbd.col(drop_null_table, "idx")),
        df_module.make_column("idx", [1, 2, 3]),
    )

    df_module.assert_column_equal(
        dn.fit_transform(sbd.col(drop_null_table, "value_almost_nan")),
        df_module.make_column("value_almost_nan", [2.5, np.nan, np.nan]),
    )

    df_module.assert_column_equal(
        dn.fit_transform(sbd.col(drop_null_table, "value_almost_null")),
        df_module.make_column("value_almost_null", ["almost", None, None]),
    )
42 changes: 41 additions & 1 deletion skrub/tests/test_table_vectorizer.py
@@ -164,6 +164,28 @@ def _get_datetimes_dataframe():
    )


def _get_missing_values_dataframe(categorical_dtype="object"):
    """
    Creates a simple DataFrame with some columns that contain only missing values.
    We'll use different types of missing values (np.nan, pd.NA, None)
    to test how the vectorizer handles full null columns with mixed null values.
    """
    return pd.DataFrame(
        {
            "int": pd.Series([15, 56, pd.NA, 12, 44], dtype="Int64"),
            "all_null": pd.Series(
                [None, None, None, None, None], dtype=categorical_dtype
            ),
            "all_nan": pd.Series(
                [np.nan, np.nan, np.nan, np.nan, np.nan], dtype="Float64"
            ),
            "mixed_nulls": pd.Series(
                [np.nan, None, pd.NA, "NULL", "NA"], dtype=categorical_dtype
            ),
        }
    )


def test_fit_default_transform():
    X = _get_clean_dataframe()
    vectorizer = TableVectorizer()
@@ -506,8 +528,11 @@ def test_changing_types(X_train, X_test, expected_X_out):
"""
table_vec = TableVectorizer(
# only extract the total seconds
datetime=DatetimeEncoder(resolution=None)
datetime=DatetimeEncoder(resolution=None),
# True by default
drop_null_columns=False,
)

table_vec.fit(X_train)
X_out = table_vec.transform(X_test)
assert (X_out.isna() == expected_X_out.isna()).all().all()
@@ -734,3 +759,18 @@ def test_supervised_encoder(df_module):
    y = np.random.default_rng(0).normal(size=sbd.shape(X)[0])
    tv = TableVectorizer(low_cardinality=TargetEncoder())
    tv.fit_transform(X, y)


def test_drop_null_column():
    """Check that all null columns are dropped, and no more."""
    # Don't drop null columns
    X = _get_missing_values_dataframe()
    tv = TableVectorizer(drop_null_columns=False)
    transformed = tv.fit_transform(X)

    assert sbd.shape(transformed) == sbd.shape(X)

    # Drop null columns
    tv = TableVectorizer(drop_null_columns=True)
    transformed = tv.fit_transform(X)
    assert sbd.shape(transformed) == (sbd.shape(X)[0], 1)
