Some unit tests and bug fixes

ThomasMeissnerDS · Aug 18, 2024 · 60dc32f · 60dc32f
1 parent 1d40084
commit 60dc32f
Show file tree

Hide file tree

Showing 6 changed files with 212 additions and 17 deletions.
diff --git a/bluecast/blueprints/preprocessing_recipes.py b/bluecast/blueprints/preprocessing_recipes.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import PowerTransformer
 
 from bluecast.preprocessing.custom import CustomPreprocessing
 from bluecast.preprocessing.remove_collinearity import remove_correlated_columns
@@ -13,7 +13,7 @@ class PreprocessingForLinearModels(CustomPreprocessing):
     def __init__(self, num_columns: Optional[List]):
         super().__init__()
         self.missing_val_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
-        self.scaler = StandardScaler()
+        self.scaler = PowerTransformer(method="yeo-johnson")
 
         if isinstance(num_columns, List):
             self.num_columns = num_columns
@@ -24,25 +24,26 @@ def __init__(self, num_columns: Optional[List]):
     def fit_transform(
         self, df: pd.DataFrame, target: pd.Series
     ) -> Tuple[pd.DataFrame, pd.Series]:
-        if len(self.num_columns) == 0:
-            self.num_columns = df.columns.to_list()
 
         df.loc[:, self.num_columns] = df.loc[:, self.num_columns].replace(
             [np.inf, -np.inf], np.nan
         )
 
-        df.loc[:, self.num_columns] = self.missing_val_imputer.fit_transform(
-            df.loc[:, self.num_columns]
-        )
-        df.loc[:, self.num_columns] = self.scaler.fit_transform(
-            df.loc[:, self.num_columns]
-        )
+        if len(self.num_columns) > 0:
+            df.loc[:, self.num_columns] = self.missing_val_imputer.fit_transform(
+                df.loc[:, self.num_columns]
+            )
+            df.loc[:, self.num_columns] = self.scaler.fit_transform(
+                df.loc[:, self.num_columns]
+            )
 
         df_non_numerical = df.loc[
             :, [col for col in df.columns.to_list() if col not in self.num_columns]
         ]
 
-        df = remove_correlated_columns(df.loc[:, self.num_columns], 0.9)
+        self.non_correlated_columns = remove_correlated_columns(
+            df.loc[:, self.num_columns], 0.9
+        ).columns.to_list()
         df_numerical = df.loc[:, self.non_correlated_columns]
         self.non_correlated_columns = df_numerical.columns.to_list()
 
@@ -60,10 +61,13 @@ def transform(
             [np.inf, -np.inf], np.nan
         )
 
-        df.loc[:, self.num_columns] = self.missing_val_imputer.transform(
-            df.loc[:, self.num_columns]
-        )
-        df.loc[:, self.num_columns] = self.scaler.transform(df.loc[:, self.num_columns])
+        if len(self.num_columns) > 0:
+            df.loc[:, self.num_columns] = self.missing_val_imputer.transform(
+                df.loc[:, self.num_columns]
+            )
+            df.loc[:, self.num_columns] = self.scaler.transform(
+                df.loc[:, self.num_columns]
+            )
 
         df_non_numerical = df.loc[
             :, [col for col in df.columns.to_list() if col not in self.num_columns]

diff --git a/bluecast/preprocessing/remove_collinearity.py b/bluecast/preprocessing/remove_collinearity.py
@@ -1,3 +1,5 @@
+import logging
+
 import pandas as pd
 
 
@@ -14,11 +16,14 @@ def remove_correlated_columns(df: pd.DataFrame, threshold: float = 0.9):
     corr_matrix = df.corr()
     for i in range(len(corr_matrix.columns)):
         for j in range(i):
-            if (corr_matrix.iloc[i, j] >= threshold) and (
-                corr_matrix.columns[j] not in col_corr
+            if (
+                (corr_matrix.iloc[i, j] >= threshold)
+                and (corr_matrix.columns[j] not in col_corr)
+                and (corr_matrix.columns[i] not in col_corr)
             ):
                 colname = corr_matrix.columns[i]  # getting the name of column
                 col_corr.add(colname)
                 if colname in df.columns:
                     del df[colname]  # deleting the column from the df
+    logging.info(f"Removed the following collinear columns: {col_corr}")
     return df
diff --git a/bluecast/tests/test_custom_model_recipes.py b/bluecast/tests/test_custom_model_recipes.py
@@ -0,0 +1,79 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+
+from bluecast.blueprints.custom_model_recipes import LogisticRegressionModel
+
+
+@pytest.fixture
+def data():
+    # Generate a synthetic binary classification dataset
+    X, y = make_classification(
+        n_samples=100, n_features=20, n_informative=2, n_redundant=10, random_state=42
+    )
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    return (
+        pd.DataFrame(X_train),
+        pd.DataFrame(X_test),
+        pd.Series(y_train),
+        pd.Series(y_test),
+    )
+
+
+@pytest.fixture
+def model():
+    return LogisticRegressionModel(random_state=42)
+
+
+def test_autotune(data, model):
+    X_train, X_test, y_train, y_test = data
+    model.autotune(X_train, X_test, y_train, y_test)
+
+    # Check if the model has been fitted by inspecting the attributes
+    assert hasattr(
+        model.model, "coef_"
+    ), "Model should have been fitted and have coefficients."
+
+
+def test_fit(data, model):
+    X_train, X_test, y_train, y_test = data
+    model.fit(X_train, X_test, y_train, y_test)
+
+    # Again, check if the model has been fitted
+    assert hasattr(
+        model.model, "coef_"
+    ), "Model should have been fitted after calling fit method."
+
+
+def test_predict(data, model):
+    X_train, X_test, y_train, y_test = data
+    model.fit(X_train, X_test, y_train, y_test)
+
+    probas, classes = model.predict(X_test)
+
+    # Check the types of the returned values
+    assert isinstance(
+        probas, np.ndarray
+    ), "Predicted probabilities should be a numpy array."
+    assert isinstance(classes, np.ndarray), "Predicted classes should be a numpy array."
+
+    # Check the shape of the returned values
+    assert probas.shape == (
+        X_test.shape[0],
+    ), "Predicted probabilities should have the correct shape."
+    assert classes.shape == (
+        X_test.shape[0],
+    ), "Predicted classes should have the correct shape."
+
+    # Check if values are within the expected range
+    assert np.all(
+        (probas >= 0) & (probas <= 1)
+    ), "Predicted probabilities should be between 0 and 1."
+    assert np.all(
+        (classes == 0) | (classes == 1)
+    ), "Predicted classes should be either 0 or 1."
diff --git a/bluecast/tests/test_preprocessing_recipes.py b/bluecast/tests/test_preprocessing_recipes.py
@@ -0,0 +1,107 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.datasets import make_classification
+
+from bluecast.blueprints.preprocessing_recipes import PreprocessingForLinearModels
+from bluecast.preprocessing.custom import CustomPreprocessing
+
+
+# Mocking remove_correlated_columns for testing purposes
+def mock_remove_correlated_columns(df, threshold):
+    return df.loc[:, df.columns[:-1]]  # Just drop the last column for simplicity
+
+
+@pytest.fixture
+def sample_data():
+    # Create a sample DataFrame with numerical data
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(5)])
+    target = pd.Series(y, name="target")
+    return df, target
+
+
+@pytest.fixture
+def preprocessing_instance():
+    # Create an instance of PreprocessingForLinearModels
+    return PreprocessingForLinearModels(
+        num_columns=["feature_0", "feature_1", "feature_2", "feature_3", "feature_4"]
+    )
+
+
+def test_initialization(preprocessing_instance):
+    # Test if the class initializes correctly
+    assert isinstance(preprocessing_instance, CustomPreprocessing)
+    assert preprocessing_instance.num_columns == [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+        "feature_4",
+    ]
+    assert preprocessing_instance.non_correlated_columns == []
+
+
+def test_fit_transform(sample_data, preprocessing_instance, monkeypatch):
+    df, target = sample_data
+
+    # Mock the remove_correlated_columns function
+    monkeypatch.setattr(
+        "bluecast.preprocessing.remove_collinearity.remove_correlated_columns",
+        mock_remove_correlated_columns,
+    )
+
+    transformed_df, transformed_target = preprocessing_instance.fit_transform(
+        df, target
+    )
+
+    # Check if transformed data has the correct shape and type
+    assert isinstance(transformed_df, pd.DataFrame)
+    assert isinstance(transformed_target, pd.Series)
+    assert transformed_df.shape == (100, 4)  # Since one column is removed by mock
+    assert transformed_target.shape == (100,)
+
+    # Check if missing values and infinite values are handled correctly
+    assert not transformed_df.isnull().any().any()
+    assert not np.isinf(transformed_df).any().any()
+
+
+def test_transform(sample_data, preprocessing_instance, monkeypatch):
+    df, target = sample_data
+
+    # Fit-transform first to simulate the normal flow
+    monkeypatch.setattr(
+        "bluecast.preprocessing.remove_collinearity.remove_correlated_columns",
+        mock_remove_correlated_columns,
+    )
+    preprocessing_instance.fit_transform(df, target)
+
+    # Now transform new data
+    new_df = df.copy()
+    new_df.loc[0, "feature_0"] = np.nan  # Introduce missing value
+
+    transformed_df, transformed_target = preprocessing_instance.transform(
+        new_df, target
+    )
+
+    # Check if transformed data has the correct shape and type
+    assert isinstance(transformed_df, pd.DataFrame)
+    assert isinstance(transformed_target, pd.Series)
+    assert transformed_df.shape == (100, 4)
+    assert transformed_target.shape == (100,)
+
+    # Check if missing values and infinite values are handled correctly
+    assert not transformed_df.isnull().any().any()
+    assert not np.isinf(transformed_df).any().any()
+
+
+def test_no_numerical_columns():
+    df = pd.DataFrame({"category": ["A", "B", "C"], "binary": [1, 0, 1]})
+    target = pd.Series([1, 0, 1])
+
+    preprocessing = PreprocessingForLinearModels(num_columns=[])
+    transformed_df, transformed_target = preprocessing.fit_transform(df, target)
+
+    # Since there are no numerical columns, the DataFrame should remain unchanged
+    pd.testing.assert_frame_equal(transformed_df, df)
+    pd.testing.assert_series_equal(transformed_target, target)
diff --git a/dist/bluecast-1.6.0-py3-none-any.whl b/dist/bluecast-1.6.0-py3-none-any.whl
diff --git a/dist/bluecast-1.6.0.tar.gz b/dist/bluecast-1.6.0.tar.gz