def plot_against_target_for_regression(
    df: pd.DataFrame, num_columns: List[Union[int, float, str]], target_col: str
) -> None:
    """
    Creates scatter plots for each column in num_columns against the target_col.
    Draws a regression line and shows the p-value for the regression line.

    All column names are validated before any figure is created, so an invalid
    argument never leaves a half-built figure behind.

    Parameters:
    - df: pd.DataFrame -> The input dataframe containing the data.
    - num_columns: List[Union[int, float, str]] -> List of column names to plot
      against the target column. Must not be empty.
    - target_col: str -> The target column name for regression.

    Returns:
    - None -> The function displays plots.

    Raises:
    - ValueError -> If target_col or any entry of num_columns is not a column
      of df, or if num_columns is empty.
    """

    if target_col not in df.columns:
        raise ValueError(
            f"Target column '{target_col}' must be part of the provided DataFrame"
        )

    # Validate up front: failing inside the plotting loop would raise only
    # after plt.subplots() has already allocated a figure.
    for column in num_columns:
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found in DataFrame")

    num_variables = len(num_columns)
    if num_variables == 0:
        # plt.subplots() cannot build a grid with zero rows; fail with a
        # message that names the actual problem instead.
        raise ValueError("num_columns must contain at least one column to plot")

    num_cols_grid = 2  # Number of columns in the grid layout
    num_rows = (num_variables + num_cols_grid - 1) // num_cols_grid  # ceil division

    # squeeze=False keeps `axes` two-dimensional even for a single row
    fig, axes = plt.subplots(
        num_rows, num_cols_grid, figsize=(14, 5 * num_rows), squeeze=False
    )

    for ax, column in zip(axes.flat, num_columns):
        # Only rows where both values are present can enter the regression
        complete_rows = df[column].notna() & df[target_col].notna()
        x = df.loc[complete_rows, column]
        y = df.loc[complete_rows, target_col]

        # Scatter plot
        sns.scatterplot(x=x, y=y, ax=ax)

        # A slope p-value needs at least one residual degree of freedom
        if len(x) >= 3:
            X = sm.add_constant(x)  # Adds a constant term to the predictor
            model = sm.OLS(y, X).fit()

            # Draw the fitted line from left to right
            order = np.argsort(x.to_numpy())
            ax.plot(
                x.to_numpy()[order],
                np.asarray(model.predict(X))[order],
                color="red",
                label="Regression Line",
            )
            ax.legend(loc="lower right")

            # pvalues is label-indexed ("const", <column>); the slope is the
            # last entry, so access it by position explicitly.
            p_value_text = f"p-value: {model.pvalues.iloc[-1]:.4f}"
        else:
            p_value_text = "p-value: n/a"

        ax.annotate(
            p_value_text,
            xy=(0.05, 0.95),
            xycoords="axes fraction",
            fontsize=12,
            verticalalignment="top",
            bbox=dict(boxstyle="round,pad=0.3", edgecolor="black", facecolor="white"),
        )

        ax.set_title(f"Scatter Plot: {column} vs {target_col}")
        ax.set_xlabel(column)
        ax.set_ylabel(target_col)

    # Remove any empty subplots left over when the grid has spare cells
    for unused_ax in axes.flat[num_variables:]:
        fig.delaxes(unused_ax)

    # Adjust the spacing between subplots
    plt.tight_layout()
    plt.show()
def test_plot_against_target_for_regression(synthetic_train_test_data_regression):
    """Smoke test: plotting the numerical features against the target completes."""
    # NOTE(review): assumes element 0 of the fixture is the training DataFrame
    # containing these feature columns and a "target" column - confirm in conftest.
    df_train = synthetic_train_test_data_regression[0]
    num_columns = ["numerical_feature_1", "numerical_feature_2", "numerical_feature_3"]

    # The function only draws; finishing without an exception and returning
    # None is its whole observable contract.
    assert plot_against_target_for_regression(df_train, num_columns, "target") is None


def test_plot_against_target_for_regression_missing_target(
    synthetic_train_test_data_regression,
):
    """A target column absent from the DataFrame is rejected with ValueError."""
    df_train = synthetic_train_test_data_regression[0]
    with pytest.raises(ValueError):
        plot_against_target_for_regression(
            df_train, ["numerical_feature_1"], "NonExistentTarget"
        )


def test_plot_against_target_for_regression_missing_column(
    synthetic_train_test_data_regression,
):
    """A feature column absent from the DataFrame is rejected with ValueError."""
    df_train = synthetic_train_test_data_regression[0]
    with pytest.raises(ValueError):
        plot_against_target_for_regression(
            df_train, ["NonExistentColumn"], "target"
        )