diff --git a/bluecast/blueprints/preprocessing_recipes.py b/bluecast/blueprints/preprocessing_recipes.py index 27c53a0f..eefc47ed 100644 --- a/bluecast/blueprints/preprocessing_recipes.py +++ b/bluecast/blueprints/preprocessing_recipes.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import PowerTransformer from bluecast.preprocessing.custom import CustomPreprocessing from bluecast.preprocessing.remove_collinearity import remove_correlated_columns @@ -13,7 +13,7 @@ class PreprocessingForLinearModels(CustomPreprocessing): def __init__(self, num_columns: Optional[List]): super().__init__() self.missing_val_imputer = SimpleImputer(missing_values=np.nan, strategy="mean") - self.scaler = StandardScaler() + self.scaler = PowerTransformer(method="yeo-johnson") if isinstance(num_columns, List): self.num_columns = num_columns @@ -24,25 +24,26 @@ def __init__(self, num_columns: Optional[List]): def fit_transform( self, df: pd.DataFrame, target: pd.Series ) -> Tuple[pd.DataFrame, pd.Series]: - if len(self.num_columns) == 0: - self.num_columns = df.columns.to_list() df.loc[:, self.num_columns] = df.loc[:, self.num_columns].replace( [np.inf, -np.inf], np.nan ) - df.loc[:, self.num_columns] = self.missing_val_imputer.fit_transform( - df.loc[:, self.num_columns] - ) - df.loc[:, self.num_columns] = self.scaler.fit_transform( - df.loc[:, self.num_columns] - ) + if len(self.num_columns) > 0: + df.loc[:, self.num_columns] = self.missing_val_imputer.fit_transform( + df.loc[:, self.num_columns] + ) + df.loc[:, self.num_columns] = self.scaler.fit_transform( + df.loc[:, self.num_columns] + ) df_non_numerical = df.loc[ :, [col for col in df.columns.to_list() if col not in self.num_columns] ] - df = remove_correlated_columns(df.loc[:, self.num_columns], 0.9) + self.non_correlated_columns = remove_correlated_columns( + df.loc[:, self.num_columns], 0.9 + ).columns.to_list() df_numerical = df.loc[:, self.non_correlated_columns] self.non_correlated_columns = df_numerical.columns.to_list() @@ -60,10 +61,13 @@ def transform( [np.inf, -np.inf], np.nan ) - df.loc[:, self.num_columns] = self.missing_val_imputer.transform( - df.loc[:, self.num_columns] - ) - df.loc[:, self.num_columns] = self.scaler.transform(df.loc[:, self.num_columns]) + if len(self.num_columns) > 0: + df.loc[:, self.num_columns] = self.missing_val_imputer.transform( + df.loc[:, self.num_columns] + ) + df.loc[:, self.num_columns] = self.scaler.transform( + df.loc[:, self.num_columns] + ) df_non_numerical = df.loc[ :, [col for col in df.columns.to_list() if col not in self.num_columns] diff --git a/bluecast/preprocessing/remove_collinearity.py b/bluecast/preprocessing/remove_collinearity.py index a77262b3..dac25ccc 100644 --- a/bluecast/preprocessing/remove_collinearity.py +++ b/bluecast/preprocessing/remove_collinearity.py @@ -1,3 +1,5 @@ +import logging + import pandas as pd @@ -14,11 +16,14 @@ def remove_correlated_columns(df: pd.DataFrame, threshold: float = 0.9): corr_matrix = df.corr() for i in range(len(corr_matrix.columns)): for j in range(i): - if (corr_matrix.iloc[i, j] >= threshold) and ( - corr_matrix.columns[j] not in col_corr + if ( + (corr_matrix.iloc[i, j] >= threshold) + and (corr_matrix.columns[j] not in col_corr) + and (corr_matrix.columns[i] not in col_corr) ): colname = corr_matrix.columns[i] # getting the name of column col_corr.add(colname) if colname in df.columns: del df[colname] # deleting the column from the df + logging.info(f"Removed the following collinear columns: {col_corr}") return df diff --git a/bluecast/tests/test_custom_model_recipes.py b/bluecast/tests/test_custom_model_recipes.py new file mode 100644 index 00000000..528ac708 --- /dev/null +++ b/bluecast/tests/test_custom_model_recipes.py @@ -0,0 +1,79 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +from bluecast.blueprints.custom_model_recipes import LogisticRegressionModel + + +@pytest.fixture +def data(): + # Generate a synthetic binary classification dataset + X, y = make_classification( + n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42 + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 + ) + + return ( + pd.DataFrame(X_train), + pd.DataFrame(X_test), + pd.Series(y_train), + pd.Series(y_test), + ) + + +@pytest.fixture +def model(): + return LogisticRegressionModel(random_state=42) + + +def test_autotune(data, model): + X_train, X_test, y_train, y_test = data + model.autotune(X_train, X_test, y_train, y_test) + + # Check if the model has been fitted by inspecting the attributes + assert hasattr( + model.model, "coef_" + ), "Model should have been fitted and have coefficients." + + +def test_fit(data, model): + X_train, X_test, y_train, y_test = data + model.fit(X_train, X_test, y_train, y_test) + + # Again, check if the model has been fitted + assert hasattr( + model.model, "coef_" + ), "Model should have been fitted after calling fit method." + + +def test_predict(data, model): + X_train, X_test, y_train, y_test = data + model.fit(X_train, X_test, y_train, y_test) + + probas, classes = model.predict(X_test) + + # Check the types of the returned values + assert isinstance( + probas, np.ndarray + ), "Predicted probabilities should be a numpy array." + assert isinstance(classes, np.ndarray), "Predicted classes should be a numpy array." + + # Check the shape of the returned values + assert probas.shape == ( + X_test.shape[0], + ), "Predicted probabilities should have the correct shape." + assert classes.shape == ( + X_test.shape[0], + ), "Predicted classes should have the correct shape." + + # Check if values are within the expected range + assert np.all( + (probas >= 0) & (probas <= 1) + ), "Predicted probabilities should be between 0 and 1." + assert np.all( + (classes == 0) | (classes == 1) + ), "Predicted classes should be either 0 or 1." diff --git a/bluecast/tests/test_preprocessing_recipes.py b/bluecast/tests/test_preprocessing_recipes.py new file mode 100644 index 00000000..e82c6b53 --- /dev/null +++ b/bluecast/tests/test_preprocessing_recipes.py @@ -0,0 +1,107 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.datasets import make_classification + +from bluecast.blueprints.preprocessing_recipes import PreprocessingForLinearModels +from bluecast.preprocessing.custom import CustomPreprocessing + + +# Mocking remove_correlated_columns for testing purposes +def mock_remove_correlated_columns(df, threshold): + return df.loc[:, df.columns[:-1]] # Just drop the last column for simplicity + + +@pytest.fixture +def sample_data(): + # Create a sample DataFrame with numerical data + X, y = make_classification(n_samples=100, n_features=5, random_state=42) + df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(5)]) + target = pd.Series(y, name="target") + return df, target + + +@pytest.fixture +def preprocessing_instance(): + # Create an instance of PreprocessingForLinearModels + return PreprocessingForLinearModels( + num_columns=["feature_0", "feature_1", "feature_2", "feature_3", "feature_4"] + ) + + +def test_initialization(preprocessing_instance): + # Test if the class initializes correctly + assert isinstance(preprocessing_instance, CustomPreprocessing) + assert preprocessing_instance.num_columns == [ + "feature_0", + "feature_1", + "feature_2", + "feature_3", + "feature_4", + ] + assert preprocessing_instance.non_correlated_columns == [] + + +def test_fit_transform(sample_data, preprocessing_instance, monkeypatch): + df, target = sample_data + + # Mock the remove_correlated_columns function + monkeypatch.setattr( + "bluecast.preprocessing.remove_collinearity.remove_correlated_columns", + mock_remove_correlated_columns, + ) + + transformed_df, transformed_target = preprocessing_instance.fit_transform( + df, target + ) + + # Check if transformed data has the correct shape and type + assert isinstance(transformed_df, pd.DataFrame) + assert isinstance(transformed_target, pd.Series) + assert transformed_df.shape == (100, 4) # Since one column is removed by mock + assert transformed_target.shape == (100,) + + # Check if missing values and infinite values are handled correctly + assert not transformed_df.isnull().any().any() + assert not np.isinf(transformed_df).any().any() + + +def test_transform(sample_data, preprocessing_instance, monkeypatch): + df, target = sample_data + + # Fit-transform first to simulate the normal flow + monkeypatch.setattr( + "bluecast.preprocessing.remove_collinearity.remove_correlated_columns", + mock_remove_correlated_columns, + ) + preprocessing_instance.fit_transform(df, target) + + # Now transform new data + new_df = df.copy() + new_df.loc[0, "feature_0"] = np.nan # Introduce missing value + + transformed_df, transformed_target = preprocessing_instance.transform( + new_df, target + ) + + # Check if transformed data has the correct shape and type + assert isinstance(transformed_df, pd.DataFrame) + assert isinstance(transformed_target, pd.Series) + assert transformed_df.shape == (100, 4) + assert transformed_target.shape == (100,) + + # Check if missing values and infinite values are handled correctly + assert not transformed_df.isnull().any().any() + assert not np.isinf(transformed_df).any().any() + + +def test_no_numerical_columns(): + df = pd.DataFrame({"category": ["A", "B", "C"], "binary": [1, 0, 1]}) + target = pd.Series([1, 0, 1]) + + preprocessing = PreprocessingForLinearModels(num_columns=[]) + transformed_df, transformed_target = preprocessing.fit_transform(df, target) + + # Since there are no numerical columns, the DataFrame should remain unchanged + pd.testing.assert_frame_equal(transformed_df, df) + pd.testing.assert_series_equal(transformed_target, target) diff --git a/dist/bluecast-1.6.0-py3-none-any.whl b/dist/bluecast-1.6.0-py3-none-any.whl index 227cf8f0..167ef910 100644 Binary files a/dist/bluecast-1.6.0-py3-none-any.whl and b/dist/bluecast-1.6.0-py3-none-any.whl differ diff --git a/dist/bluecast-1.6.0.tar.gz b/dist/bluecast-1.6.0.tar.gz index bfe1820f..f9567820 100644 Binary files a/dist/bluecast-1.6.0.tar.gz and b/dist/bluecast-1.6.0.tar.gz differ