diff --git a/README.md b/README.md
index 8f01bfd1c..594566859 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,8 @@ Please share your story by answering 1 quick question
 * EqualWidthDiscretiser
 * GeometricWidthDiscretiser
 * DecisionTreeDiscretiser
-* ArbitraryDiscreriser
+* ArbitraryDiscretiser
+* BinaryDiscretiser
 
 ### Outlier Handling methods
 * Winsorizer
diff --git a/docs/api_doc/discretisation/BinaryDiscretiser.rst b/docs/api_doc/discretisation/BinaryDiscretiser.rst
new file mode 100644
index 000000000..53d0d14e8
--- /dev/null
+++ b/docs/api_doc/discretisation/BinaryDiscretiser.rst
@@ -0,0 +1,5 @@
+BinaryDiscretiser
+=================
+
+.. autoclass:: feature_engine.discretisation.BinaryDiscretiser
+    :members:
diff --git a/docs/api_doc/discretisation/index.rst b/docs/api_doc/discretisation/index.rst
index 75c484e9d..9fad0bd14 100644
--- a/docs/api_doc/discretisation/index.rst
+++ b/docs/api_doc/discretisation/index.rst
@@ -18,6 +18,7 @@ into continuous intervals.
 :class:`ArbitraryDiscretiser()` Sorts values into intervals predefined by the user.
 :class:`DecisionTreeDiscretiser()` Replaces values by predictions of a decision tree, which are discrete.
 :class:`GeometricWidthDiscretiser()` Sorts variable into geometrical intervals.
+:class:`BinaryDiscretiser()` Sorts variable into two intervals determined by a threshold.
 ===================================== ========================================================================
 
 
diff --git a/docs/index.rst b/docs/index.rst
index e30eb7eb0..fa8e15668 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -183,6 +183,7 @@ discretization with decision trees:
 - :doc:`api_doc/discretisation/EqualWidthDiscretiser`: sorts variable into equal width intervals
 - :doc:`api_doc/discretisation/DecisionTreeDiscretiser`: uses decision trees to create finite variables
 - :doc:`api_doc/discretisation/GeometricWidthDiscretiser`: sorts variable into geometrical intervals
+- :doc:`api_doc/discretisation/BinaryDiscretiser`: sorts variable into two intervals determined by a threshold
 
 Outlier Capping or Removal
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/feature_engine/discretisation/__init__.py b/feature_engine/discretisation/__init__.py
index 5016d1aa9..9d1e602e6 100644
--- a/feature_engine/discretisation/__init__.py
+++ b/feature_engine/discretisation/__init__.py
@@ -8,6 +8,7 @@
 from .equal_frequency import EqualFrequencyDiscretiser
 from .equal_width import EqualWidthDiscretiser
 from .geometric_width import GeometricWidthDiscretiser
+from .binariser import BinaryDiscretiser
 
 __all__ = [
     "DecisionTreeDiscretiser",
@@ -15,4 +16,5 @@
     "EqualWidthDiscretiser",
     "ArbitraryDiscretiser",
     "GeometricWidthDiscretiser",
+    "BinaryDiscretiser",
 ]
diff --git a/feature_engine/discretisation/binariser.py b/feature_engine/discretisation/binariser.py
new file mode 100644
index 000000000..d6e41570f
--- /dev/null
+++ b/feature_engine/discretisation/binariser.py
@@ -0,0 +1,231 @@
+from typing import List, Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from feature_engine._check_init_parameters.check_variables import (
+    _check_variables_input_value,
+)
+from feature_engine._docstrings.fit_attributes import (
+    _binner_dict_docstring,
+    _feature_names_in_docstring,
+    _n_features_in_docstring,
+    _variables_attribute_docstring,
+)
+from feature_engine._docstrings.init_parameters.all_trasnformers import (
+    _variables_numerical_docstring,
+)
+from feature_engine._docstrings.init_parameters.discretisers import (
+    _precision_docstring,
+    _return_boundaries_docstring,
+    _return_object_docstring,
+)
+from feature_engine._docstrings.methods import (
+    _fit_discretiser_docstring,
+    _fit_transform_docstring,
+    _transform_discretiser_docstring,
+)
+from feature_engine._docstrings.substitute import Substitution
+from feature_engine.discretisation.base_discretiser import BaseDiscretiser
+
+
+@Substitution(
+    return_object=_return_object_docstring,
+    return_boundaries=_return_boundaries_docstring,
+    precision=_precision_docstring,
+    binner_dict_=_binner_dict_docstring,
+    fit=_fit_discretiser_docstring,
+    transform=_transform_discretiser_docstring,
+    variables=_variables_numerical_docstring,
+    variables_=_variables_attribute_docstring,
+    feature_names_in_=_feature_names_in_docstring,
+    n_features_in_=_n_features_in_docstring,
+    fit_transform=_fit_transform_docstring,
+)
+class BinaryDiscretiser(BaseDiscretiser):
+    """
+    The BinaryDiscretiser() divides continuous numerical variables into two
+    intervals, where the cut point, `threshold`, is determined by the user.
+
+    The BinaryDiscretiser() works only with numerical variables. A list of
+    variables can be passed as an argument. Alternatively, the discretiser
+    will automatically select all numerical variables.
+
+    The BinaryDiscretiser() first finds the interval boundaries for each
+    variable. Then, it transforms the variables, that is, it sorts the values
+    into the intervals.
+
+    Parameters
+    ----------
+    {variables}
+
+    threshold: int, float, default=None
+        The value at which the variable is divided into two intervals.
+
+    {return_object}
+
+    {return_boundaries}
+
+    {precision}
+
+    Attributes
+    ----------
+    {binner_dict_}
+
+    {variables_}
+
+    {feature_names_in_}
+
+    {n_features_in_}
+
+    Methods
+    -------
+    {fit}
+
+    {fit_transform}
+
+    {transform}
+
+    See Also
+    --------
+    pandas.cut
+    sklearn.preprocessing.KBinsDiscretizer
+
+    References
+    ----------
+    .. [1] Kotsiantis and Pintelas, "Data preprocessing for supervised leaning,"
+        International Journal of Computer Science, vol. 1, pp. 111-117, 2006.
+
+    .. [2] Dong. "Beating Kaggle the easy way". Master Thesis.
+        https://www.ke.tu-darmstadt.de/lehre/arbeiten/studien/2015/Dong_Ying.pdf
+
+    Examples
+    --------
+
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> from feature_engine.discretisation import BinaryDiscretiser
+    >>> np.random.seed(42)
+    >>> X = pd.DataFrame(dict(x = np.random.randint(1,100, 100)))
+    >>> transformer = BinaryDiscretiser(threshold=50)
+    >>> transformer.fit(X)
+    >>> transformer.transform(X)['x'].value_counts()
+    x
+    1    56
+    0    44
+    Name: count, dtype: int64
+    """
+
+    def __init__(
+        self,
+        threshold: Union[None, int, float] = None,
+        variables: Union[None, int, str, List[Union[str, int]]] = None,
+        return_object: bool = False,
+        return_boundaries: bool = False,
+        precision: int = 3,
+    ) -> None:
+
+        if threshold is None:
+            raise TypeError(
+                "threshold not supplied."
+                " Please provide a threshold of type float or int."
+            )
+
+        if not isinstance(threshold, (int, float)):
+            raise TypeError(
+                "threshold must be an integer or a float."
+                f" Got type '{type(threshold).__name__}' instead."
+            )
+
+        super().__init__(return_object, return_boundaries, precision)
+
+        self.variables = _check_variables_input_value(variables)
+        self.threshold = threshold
+
+    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
+        """
+        Learn the boundaries of the bins for each variable.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The training dataset.
+            Can be the entire dataframe, not just the variables to be
+            transformed.
+        y: None
+            y is not needed in this transformer. You can pass y or None.
+        """
+
+        # check input dataframe
+        X = super().fit(X)
+
+        failed_threshold_check = []
+        self.binner_dict_ = {}
+        for var in self.variables_:
+            # check that the threshold is within the range of the variable
+            if (self.threshold < min(X[var])) or (self.threshold > max(X[var])):
+                # omit these features from the transformation step
+                failed_threshold_check.append(var)
+            else:
+                self.binner_dict_[var] = [
+                    float("-inf"),
+                    np.float64(self.threshold),
+                    float("inf"),
+                ]
+
+        if failed_threshold_check:
+            print(
+                "threshold outside of range for one or more variables."
+                f" Features {failed_threshold_check} have not been transformed."
+            )
+
+        # the features that pass the threshold check and will be transformed
+        self.variables_trans_ = [
+            var for var in self.variables_ if var not in failed_threshold_check
+        ]
+
+        return self
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Sort the variable values into the intervals.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The data to transform.
+
+        Returns
+        -------
+        X_new: pandas dataframe of shape = [n_samples, n_features]
+            The transformed data with the discrete variables.
+        """
+
+        # check input dataframe and if class was fitted
+        X = self._check_transform_input_and_state(X)
+
+        # transform variables
+        if self.return_boundaries is True:
+            for feature in self.variables_trans_:
+                X[feature] = pd.cut(
+                    X[feature],
+                    self.binner_dict_[feature],
+                    precision=self.precision,
+                    include_lowest=True,
+                )
+            X[self.variables_trans_] = X[self.variables_trans_].astype(str)
+
+        else:
+            for feature in self.variables_trans_:
+                X[feature] = pd.cut(
+                    X[feature],
+                    self.binner_dict_[feature],
+                    labels=False,
+                    include_lowest=True,
+                )
+
+        # return object
+        if self.return_object:
+            X[self.variables_trans_] = X[self.variables_trans_].astype("O")
+
+        return X
diff --git a/tests/test_discretisation/test_binariser.py b/tests/test_discretisation/test_binariser.py
new file mode 100644
index 000000000..bc0dda892
--- /dev/null
+++ b/tests/test_discretisation/test_binariser.py
@@ -0,0 +1,84 @@
+import numpy as np
+import pytest
+from sklearn.exceptions import NotFittedError
+
+from feature_engine.discretisation import BinaryDiscretiser
+
+
+def test_automatically_find_variables_and_return_as_numeric(df_normal_dist):
+    # test case 1: automatically select variables, return_object=False
+    transformer = BinaryDiscretiser(threshold=0, variables=None, return_object=False)
+    X = transformer.fit_transform(df_normal_dist)
+
+    # transform input
+    Xt = np.where(df_normal_dist["var"] > 0, 1, 0)
+    bins = [float("-inf"), np.float64(0), float("inf")]
+
+    # init params
+    assert transformer.threshold == 0
+    assert transformer.variables is None
+    assert transformer.return_object is False
+    # fit params
+    assert transformer.variables_ == ["var"]
+    assert transformer.n_features_in_ == 1
+    assert transformer.binner_dict_["var"] == bins
+    # check transformed output against Xt
+    assert all(x == y for x, y in zip(X["var"].values, Xt))
+
+
+def test_automatically_find_variables_and_return_as_object(df_normal_dist):
+    transformer = BinaryDiscretiser(threshold=0, variables=None, return_object=True)
+    X = transformer.fit_transform(df_normal_dist)
+    assert X["var"].dtypes == "O"
+
+
+def test_error_when_threshold_not_int_or_float():
+    with pytest.raises(TypeError):
+        BinaryDiscretiser(threshold="other")
+
+
+def test_error_when_threshold_not_supplied():
+    with pytest.raises(TypeError):
+        BinaryDiscretiser()
+
+
+def test_error_if_return_object_not_bool():
+    with pytest.raises(ValueError):
+        BinaryDiscretiser(threshold=0, return_object="other")
+
+
+def test_error_if_input_df_contains_na_in_fit(df_na):
+    # test case 3: when dataset contains na, fit method
+    with pytest.raises(ValueError):
+        transformer = BinaryDiscretiser(threshold=0)
+        transformer.fit(df_na)
+
+
+def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na):
+    # test case 4: when dataset contains na, transform method
+    with pytest.raises(ValueError):
+        transformer = BinaryDiscretiser(threshold=0)
+        transformer.fit(df_vartypes)
+        transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
+
+
+def test_non_fitted_error(df_vartypes):
+    with pytest.raises(NotFittedError):
+        transformer = BinaryDiscretiser(threshold=0)
+        transformer.transform(df_vartypes)
+
+
+def test_stdout_threshold_out_of_range(df_vartypes, capsys):
+    transformer = BinaryDiscretiser(threshold=20, variables=None, return_object=False)
+    _ = transformer.fit_transform(df_vartypes[["Age", "Marks"]])
+    captured = capsys.readouterr()
+    assert (
+        captured.out
+        == "threshold outside of range for one or more variables."
+        " Features ['Marks'] have not been transformed.\n"
+    )
+
+
+def test_return_boundaries(df_normal_dist):
+    transformer = BinaryDiscretiser(threshold=0, return_boundaries=True)
+    Xt = transformer.fit_transform(df_normal_dist)
+    # with return_boundaries=True the values become interval strings,
+    # so none of the original numeric values should remain
+    assert all(x not in Xt["var"].values for x in df_normal_dist["var"].unique())
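
For reviewers, here is a quick usage sketch of the new transformer. This is illustrative only: the dataframe and column names are hypothetical, and `marks` is deliberately outside the threshold range to trigger the fit-time range check implemented above.

```python
# Sketch of BinaryDiscretiser usage, assuming this branch is installed.
# The dataframe `df` and columns "age"/"marks" are made-up examples.
import numpy as np
import pandas as pd

from feature_engine.discretisation import BinaryDiscretiser

np.random.seed(0)
df = pd.DataFrame({
    "age": np.random.randint(18, 90, 50),      # threshold 30 falls inside this range
    "marks": np.random.uniform(0.0, 1.0, 50),  # threshold 30 is outside this range
})

# Values <= 30 are sorted into bin 0, values > 30 into bin 1.
transformer = BinaryDiscretiser(threshold=30, variables=["age", "marks"])
transformer.fit(df)  # prints that 'marks' failed the range check and is skipped

df_t = transformer.transform(df)  # only 'age' is discretised
print(df_t["age"].value_counts())

# With return_boundaries=True, the interval edges are returned as strings.
transformer = BinaryDiscretiser(threshold=30, return_boundaries=True)
print(transformer.fit_transform(df[["age"]])["age"].unique())
```

Note the design choice: variables whose range does not contain the threshold are silently dropped from `variables_trans_` rather than raising, with a printed message; `test_stdout_threshold_out_of_range` pins down that behaviour.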