Skip to content

Commit

Permalink
Add GroupLevelAggFeatures feature generator
Browse files Browse the repository at this point in the history
  • Loading branch information
ThomasMeissnerDS committed Aug 22, 2024
1 parent dce9510 commit 0e9cc83
Show file tree
Hide file tree
Showing 2 changed files with 204 additions and 0 deletions.
69 changes: 69 additions & 0 deletions bluecast/preprocessing/feature_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
import polars as pl
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

Expand Down Expand Up @@ -115,6 +116,74 @@ def add_row_level_agg_features(
return df


class GroupLevelAggFeatures:
    """
    Create group-level aggregation features using Polars.

    The bookkeeping attributes describe the most recent call of
    ``create_groupby_agg_features``.
    """

    def __init__(self):
        # Column names of the DataFrame passed to the most recent call.
        self.original_features: List[Union[str, int, float]] = []
        # Names of the aggregate columns created by the most recent call.
        self.agg_features_created: List[Union[str, int, float]] = []

    def create_groupby_agg_features(
        self,
        df: Union[pd.DataFrame, pl.DataFrame],
        groupby_columns: List[str],
        columns_to_agg: Optional[List[str]],
        target_col: Optional[str],
        aggregations: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """
        Create aggregations based on groups for a given DataFrame.

        The aggregates are computed once per group and then left-joined back
        against the group keys (and the target column, if present) of the
        input rows, so each group's aggregates are repeated for every matching
        input row.
        NOTE(review): ``group_by`` is called without ``maintain_order``, so the
        row order of the result is presumably not guaranteed to match the
        input - confirm before aligning the result with ``df`` by position.

        :param df: Either Pandas or Polars DataFrame.
        :param groupby_columns: List of column names to use for the groupby.
            A single column name given as a string is accepted as well.
        :param columns_to_agg: List of columns to aggregate. If empty or None,
            all columns except the groupby columns and the target column
            (target_col) will be chosen. The passed list is never modified.
        :param target_col: Target column name. Will be ignored during
            aggregation and, if it exists in df, is appended to the result.
        :param aggregations: Names of Polars expression methods to apply
            (looked up via ``getattr`` on ``pl.col``). If not provided as a
            list, ["min", "max", "mean", "sum"] will be used.
        :return: Pandas DataFrame with the groupby columns, one
            ``{column}_{aggregation}`` column per requested combination and,
            if present in df, the target column.
        """
        if not isinstance(aggregations, list):
            aggregations = ["min", "max", "mean", "sum"]

        # list("abc") would split a bare column name into characters, so wrap
        # strings explicitly before copying.
        if isinstance(groupby_columns, str):
            groupby_columns = [groupby_columns]
        else:
            groupby_columns = list(groupby_columns)

        if isinstance(df, pd.DataFrame):
            # Dedicated pandas converter instead of the generic interchange
            # protocol entry point (pl.from_dataframe).
            df = pl.from_pandas(df)

        self.original_features = list(df.columns)

        # Determine which columns to aggregate. Work on a copy so the
        # caller's list is left untouched.
        if columns_to_agg:
            candidate_columns = list(columns_to_agg)
        else:
            # Aggregating a group key within its own group is degenerate, so
            # the keys are left out when defaulting to "all columns".
            candidate_columns = [
                col for col in df.columns if col not in groupby_columns
            ]

        # The target column must never be aggregated.
        selected_columns = [col for col in candidate_columns if col != target_col]

        # Define the aggregation operations. The name list is rebuilt on every
        # call so repeated calls do not accumulate duplicate entries.
        agg_ops = []
        created_features: List[Union[str, int, float]] = []
        for col in selected_columns:
            for agg in aggregations:
                feature_name = f"{col}_{agg}"
                agg_ops.append(getattr(pl.col(col), agg)().alias(feature_name))
                created_features.append(feature_name)
        self.agg_features_created = created_features

        df_grouped = df.group_by(groupby_columns).agg(agg_ops)

        # Join back against the input rows; carry the target column along
        # only when it actually exists in the input.
        join_columns = list(groupby_columns)
        if target_col in self.original_features:
            join_columns.append(target_col)

        df_grouped = df_grouped.join(
            df.select(join_columns),
            on=groupby_columns,
            how="left",
        )

        return df_grouped.to_pandas()


class FeatureClusteringScorer:
def __init__(
self,
Expand Down
135 changes: 135 additions & 0 deletions bluecast/tests/test_feature_creation.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from collections import namedtuple

import pandas as pd
import polars as pl
import pytest

from bluecast.preprocessing.feature_creation import (
AddRowLevelAggFeatures,
FeatureClusteringScorer,
GroupLevelAggFeatures,
)


Expand Down Expand Up @@ -229,3 +231,136 @@ def test_changing_features(synthetic_data):
assert cluster_results.head(1)["loyalty"].values[0] == 1
assert cluster_results.tail(1)["loyalty"].values[0] == 4
assert cluster_results["total_score"].max() == 11


# Tests for the GroupLevelAggFeatures feature generator
@pytest.fixture
def polars_dataframe(sample_dataframe):
    """Provide the ``sample_dataframe`` fixture converted to a Polars DataFrame."""
    converted = pl.from_pandas(sample_dataframe)
    return converted


def test_initialization_grouplevelaggfeatures():
    """A freshly constructed generator starts with empty bookkeeping lists."""
    fresh_generator = GroupLevelAggFeatures()

    assert fresh_generator.agg_features_created == []
    assert fresh_generator.original_features == []


def test_create_groupby_agg_features_pandas(sample_dataframe):
    """Pandas input: mean/sum aggregates of A and B, grouped by ``target``."""
    generator = GroupLevelAggFeatures()
    expected_aggs = ["A_mean", "A_sum", "B_mean", "B_sum"]

    result_df = generator.create_groupby_agg_features(
        sample_dataframe,
        groupby_columns=["target"],
        columns_to_agg=["A", "B"],
        target_col="C",
        aggregations=["mean", "sum"],
    )

    # Group key first, then the aggregates, then the re-attached target column.
    assert list(result_df.columns) == ["target", *expected_aggs, "C"]
    assert generator.original_features == ["A", "B", "C", "target"]
    assert generator.agg_features_created == expected_aggs


def test_create_groupby_agg_features_polars(polars_dataframe):
    """Polars input: min/max aggregates of A and B, grouped by ``target``."""
    generator = GroupLevelAggFeatures()
    expected_aggs = ["A_min", "A_max", "B_min", "B_max"]

    result_df = generator.create_groupby_agg_features(
        polars_dataframe,
        groupby_columns=["target"],
        columns_to_agg=["A", "B"],
        target_col="C",
        aggregations=["min", "max"],
    )

    # Group key first, then the aggregates, then the re-attached target column.
    assert list(result_df.columns) == ["target", *expected_aggs, "C"]
    assert generator.original_features == ["A", "B", "C", "target"]
    assert generator.agg_features_created == expected_aggs


def test_create_groupby_agg_features_no_columns_to_agg(sample_dataframe):
    """
    Without ``columns_to_agg`` and ``aggregations``, A and B are expected to
    receive the four default aggregates; neither the groupby column nor the
    target column is expected to be aggregated.
    """
    generator = GroupLevelAggFeatures()
    # Column-major order: every aggregate of A, then every aggregate of B.
    expected_aggs = [
        f"{column}_{agg}"
        for column in ("A", "B")
        for agg in ("min", "max", "mean", "sum")
    ]

    result_df = generator.create_groupby_agg_features(
        sample_dataframe,
        groupby_columns=["target"],
        columns_to_agg=None,
        target_col="C",
    )

    assert list(result_df.columns) == ["target", *expected_aggs, "C"]
    assert generator.original_features == ["A", "B", "C", "target"]
    assert generator.agg_features_created == expected_aggs


def test_create_groupby_agg_features_no_target_column(sample_dataframe):
    """With ``target_col=None`` no target column is appended to the result."""
    generator = GroupLevelAggFeatures()
    # Column-major order: every default aggregate of A, then of B.
    expected_aggs = [
        f"{column}_{agg}"
        for column in ("A", "B")
        for agg in ("min", "max", "mean", "sum")
    ]

    result_df = generator.create_groupby_agg_features(
        sample_dataframe,
        groupby_columns=["target"],
        columns_to_agg=["A", "B"],
        target_col=None,
    )

    assert list(result_df.columns) == ["target", *expected_aggs]
    assert generator.original_features == ["A", "B", "C", "target"]
    assert generator.agg_features_created == expected_aggs


def test_create_groupby_agg_features_default_aggregations(sample_dataframe):
    """Omitting ``aggregations`` falls back to min, max, mean and sum."""
    generator = GroupLevelAggFeatures()
    expected_aggs = ["A_min", "A_max", "A_mean", "A_sum"]

    result_df = generator.create_groupby_agg_features(
        sample_dataframe,
        groupby_columns=["target"],
        columns_to_agg=["A"],
        target_col=None,
    )

    assert list(result_df.columns) == ["target", *expected_aggs]
    assert generator.original_features == ["A", "B", "C", "target"]
    assert generator.agg_features_created == expected_aggs

0 comments on commit 0e9cc83

Please sign in to comment.