Skip to content

Commit

Permalink
Some unit tests and bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
ThomasMeissnerDS committed Aug 18, 2024
1 parent 1d40084 commit 60dc32f
Show file tree
Hide file tree
Showing 6 changed files with 212 additions and 17 deletions.
34 changes: 19 additions & 15 deletions bluecast/blueprints/preprocessing_recipes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

from bluecast.preprocessing.custom import CustomPreprocessing
from bluecast.preprocessing.remove_collinearity import remove_correlated_columns
Expand All @@ -13,7 +13,7 @@ class PreprocessingForLinearModels(CustomPreprocessing):
def __init__(self, num_columns: Optional[List]):
super().__init__()
self.missing_val_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
self.scaler = StandardScaler()
self.scaler = PowerTransformer(method="yeo-johnson")

if isinstance(num_columns, List):
self.num_columns = num_columns
Expand All @@ -24,25 +24,26 @@ def __init__(self, num_columns: Optional[List]):
def fit_transform(
self, df: pd.DataFrame, target: pd.Series
) -> Tuple[pd.DataFrame, pd.Series]:
if len(self.num_columns) == 0:
self.num_columns = df.columns.to_list()

df.loc[:, self.num_columns] = df.loc[:, self.num_columns].replace(
[np.inf, -np.inf], np.nan
)

df.loc[:, self.num_columns] = self.missing_val_imputer.fit_transform(
df.loc[:, self.num_columns]
)
df.loc[:, self.num_columns] = self.scaler.fit_transform(
df.loc[:, self.num_columns]
)
if len(self.num_columns) > 0:
df.loc[:, self.num_columns] = self.missing_val_imputer.fit_transform(
df.loc[:, self.num_columns]
)
df.loc[:, self.num_columns] = self.scaler.fit_transform(
df.loc[:, self.num_columns]
)

df_non_numerical = df.loc[
:, [col for col in df.columns.to_list() if col not in self.num_columns]
]

df = remove_correlated_columns(df.loc[:, self.num_columns], 0.9)
self.non_correlated_columns = remove_correlated_columns(
df.loc[:, self.num_columns], 0.9
).columns.to_list()
df_numerical = df.loc[:, self.non_correlated_columns]
self.non_correlated_columns = df_numerical.columns.to_list()

Expand All @@ -60,10 +61,13 @@ def transform(
[np.inf, -np.inf], np.nan
)

df.loc[:, self.num_columns] = self.missing_val_imputer.transform(
df.loc[:, self.num_columns]
)
df.loc[:, self.num_columns] = self.scaler.transform(df.loc[:, self.num_columns])
if len(self.num_columns) > 0:
df.loc[:, self.num_columns] = self.missing_val_imputer.transform(
df.loc[:, self.num_columns]
)
df.loc[:, self.num_columns] = self.scaler.transform(
df.loc[:, self.num_columns]
)

df_non_numerical = df.loc[
:, [col for col in df.columns.to_list() if col not in self.num_columns]
Expand Down
9 changes: 7 additions & 2 deletions bluecast/preprocessing/remove_collinearity.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

import pandas as pd


Expand All @@ -14,11 +16,14 @@ def remove_correlated_columns(df: pd.DataFrame, threshold: float = 0.9):
corr_matrix = df.corr()
for i in range(len(corr_matrix.columns)):
for j in range(i):
if (corr_matrix.iloc[i, j] >= threshold) and (
corr_matrix.columns[j] not in col_corr
if (
(corr_matrix.iloc[i, j] >= threshold)
and (corr_matrix.columns[j] not in col_corr)
and (corr_matrix.columns[i] not in col_corr)
):
colname = corr_matrix.columns[i] # getting the name of column
col_corr.add(colname)
if colname in df.columns:
del df[colname] # deleting the column from the df
logging.info(f"Removed the following collinear columns: {col_corr}")
return df
79 changes: 79 additions & 0 deletions bluecast/tests/test_custom_model_recipes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from bluecast.blueprints.custom_model_recipes import LogisticRegressionModel


@pytest.fixture
def data():
    """Provide a small synthetic binary-classification train/test split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the feature parts
        are DataFrames and the label parts are Series.
    """
    features, labels = make_classification(
        n_samples=100,
        n_features=20,
        n_informative=2,
        n_redundant=10,
        random_state=42,
    )
    # Hold out 20% of the rows; the split is seeded so it is reproducible.
    feat_train, feat_test, lab_train, lab_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    train_frame = pd.DataFrame(feat_train)
    test_frame = pd.DataFrame(feat_test)
    train_labels = pd.Series(lab_train)
    test_labels = pd.Series(lab_test)
    return train_frame, test_frame, train_labels, test_labels


@pytest.fixture
def model():
    """Provide a fresh ``LogisticRegressionModel`` seeded with ``random_state=42``."""
    seeded_model = LogisticRegressionModel(random_state=42)
    return seeded_model


def test_autotune(data, model):
    """After ``autotune`` the wrapped estimator must expose ``coef_``."""
    # The fixture yields (X_train, X_test, y_train, y_test), which is exactly
    # the positional order autotune is called with.
    model.autotune(*data)

    # Presence of ``coef_`` on the wrapped estimator is used as the fitted marker.
    fitted_estimator = model.model
    assert hasattr(
        fitted_estimator, "coef_"
    ), "Model should have been fitted and have coefficients."


def test_fit(data, model):
    """After ``fit`` the wrapped estimator must expose ``coef_``."""
    # The fixture yields (X_train, X_test, y_train, y_test), which is exactly
    # the positional order fit is called with.
    model.fit(*data)

    # Presence of ``coef_`` on the wrapped estimator is used as the fitted marker.
    fitted_estimator = model.model
    assert hasattr(
        fitted_estimator, "coef_"
    ), "Model should have been fitted after calling fit method."


def test_predict(data, model):
    """``predict`` returns 1-D numpy arrays of probabilities and 0/1 labels."""
    X_train, X_test, y_train, y_test = data
    model.fit(X_train, X_test, y_train, y_test)

    probas, classes = model.predict(X_test)
    one_value_per_row = (X_test.shape[0],)

    # Return types
    assert isinstance(
        probas, np.ndarray
    ), "Predicted probabilities should be a numpy array."
    assert isinstance(classes, np.ndarray), "Predicted classes should be a numpy array."

    # Return shapes: one entry per test row
    assert (
        probas.shape == one_value_per_row
    ), "Predicted probabilities should have the correct shape."
    assert (
        classes.shape == one_value_per_row
    ), "Predicted classes should have the correct shape."

    # Value ranges
    in_unit_interval = (probas >= 0) & (probas <= 1)
    assert in_unit_interval.all(), "Predicted probabilities should be between 0 and 1."
    is_binary_label = (classes == 0) | (classes == 1)
    assert is_binary_label.all(), "Predicted classes should be either 0 or 1."
107 changes: 107 additions & 0 deletions bluecast/tests/test_preprocessing_recipes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import make_classification

from bluecast.blueprints.preprocessing_recipes import PreprocessingForLinearModels
from bluecast.preprocessing.custom import CustomPreprocessing


# Mocking remove_correlated_columns for testing purposes
def mock_remove_correlated_columns(df, threshold):
    """Stand-in for ``remove_correlated_columns``: keep all columns but the last.

    ``threshold`` is accepted only to mirror the real signature; it is ignored.
    A new frame is returned and ``df`` itself is left untouched.
    """
    kept_columns = df.columns[:-1]
    return df.loc[:, kept_columns]

Check warning on line 12 in bluecast/tests/test_preprocessing_recipes.py

View check run for this annotation

Codecov / codecov/patch

bluecast/tests/test_preprocessing_recipes.py#L12

Added line #L12 was not covered by tests


@pytest.fixture
def sample_data():
    """Provide ``(df, target)`` built from a seeded synthetic classification set.

    ``df`` has 100 rows and columns ``feature_0`` .. ``feature_4``; ``target``
    is the label Series named ``"target"``.
    """
    features, labels = make_classification(
        n_samples=100, n_features=5, random_state=42
    )
    column_names = [f"feature_{i}" for i in range(5)]
    frame = pd.DataFrame(features, columns=column_names)
    label_series = pd.Series(labels, name="target")
    return frame, label_series


@pytest.fixture
def preprocessing_instance():
    """Provide a ``PreprocessingForLinearModels`` covering ``feature_0`` .. ``feature_4``."""
    numeric_columns = [f"feature_{i}" for i in range(5)]
    return PreprocessingForLinearModels(num_columns=numeric_columns)


def test_initialization(preprocessing_instance):
    """A new instance keeps the given columns and starts with no selected columns."""
    expected_columns = [f"feature_{i}" for i in range(5)]

    assert isinstance(preprocessing_instance, CustomPreprocessing)
    assert preprocessing_instance.num_columns == expected_columns
    # Nothing has been fitted yet, so no column selection exists.
    assert preprocessing_instance.non_correlated_columns == []


def test_fit_transform(sample_data, preprocessing_instance, monkeypatch):
    """``fit_transform`` returns a clean frame with the mocked column removal applied."""
    df, target = sample_data

    # Patch the name where it is *looked up*. preprocessing_recipes binds
    # remove_correlated_columns with ``from ... import``, so replacing the
    # attribute on bluecast.preprocessing.remove_collinearity leaves the real
    # function in use and the mock never runs.
    monkeypatch.setattr(
        "bluecast.blueprints.preprocessing_recipes.remove_correlated_columns",
        mock_remove_correlated_columns,
    )

    transformed_df, transformed_target = preprocessing_instance.fit_transform(
        df, target
    )

    # Check if transformed data has the correct shape and type
    assert isinstance(transformed_df, pd.DataFrame)
    assert isinstance(transformed_target, pd.Series)
    assert transformed_df.shape == (100, 4)  # Since one column is removed by mock
    assert transformed_target.shape == (100,)

    # The mock drops the last numerical column, so exactly the first four
    # must be recorded; this also proves the mock was actually used.
    assert preprocessing_instance.non_correlated_columns == [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]

    # Check if missing values and infinite values are handled correctly
    assert not transformed_df.isnull().any().any()
    assert not np.isinf(transformed_df).any().any()


def test_transform(sample_data, preprocessing_instance, monkeypatch):
    """``transform`` on a fitted instance imputes NaNs and keeps the fitted columns."""
    df, target = sample_data

    # Patch the name where it is *looked up*. preprocessing_recipes binds
    # remove_correlated_columns with ``from ... import``, so replacing the
    # attribute on bluecast.preprocessing.remove_collinearity leaves the real
    # function in use and the mock never runs.
    monkeypatch.setattr(
        "bluecast.blueprints.preprocessing_recipes.remove_correlated_columns",
        mock_remove_correlated_columns,
    )
    # Fit-transform first to simulate the normal flow
    preprocessing_instance.fit_transform(df, target)

    # Now transform new data
    new_df = df.copy()
    new_df.loc[0, "feature_0"] = np.nan  # Introduce missing value

    transformed_df, transformed_target = preprocessing_instance.transform(
        new_df, target
    )

    # Check if transformed data has the correct shape and type
    assert isinstance(transformed_df, pd.DataFrame)
    assert isinstance(transformed_target, pd.Series)
    assert transformed_df.shape == (100, 4)
    assert transformed_target.shape == (100,)

    # Check if missing values and infinite values are handled correctly
    assert not transformed_df.isnull().any().any()
    assert not np.isinf(transformed_df).any().any()


def test_no_numerical_columns():
    """With an empty ``num_columns`` list, ``fit_transform`` output equals its input."""
    frame = pd.DataFrame({"category": ["A", "B", "C"], "binary": [1, 0, 1]})
    labels = pd.Series([1, 0, 1])

    preprocessor = PreprocessingForLinearModels(num_columns=[])
    result_frame, result_labels = preprocessor.fit_transform(frame, labels)

    # No columns were declared numerical, so neither the frame nor the
    # target is expected to differ from what was passed in.
    pd.testing.assert_frame_equal(result_frame, frame)
    pd.testing.assert_series_equal(result_labels, labels)
Binary file modified dist/bluecast-1.6.0-py3-none-any.whl
Binary file not shown.
Binary file modified dist/bluecast-1.6.0.tar.gz
Binary file not shown.

0 comments on commit 60dc32f

Please sign in to comment.