Add GroupLevelAggFeatures feature generator

ThomasMeissnerDS · Aug 22, 2024 · 0e9cc83 · 0e9cc83
1 parent dce9510
commit 0e9cc83
Show file tree

Hide file tree

Showing 2 changed files with 204 additions and 0 deletions.
diff --git a/bluecast/preprocessing/feature_creation.py b/bluecast/preprocessing/feature_creation.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import MinMaxScaler
 
@@ -115,6 +116,74 @@ def add_row_level_agg_features(
         return df
 
 
+class GroupLevelAggFeatures:
+    """
+    Create group-level aggregation features for a DataFrame.
+
+    After each call to ``create_groupby_agg_features`` the instance holds the
+    column names of the processed frame (``original_features``) and the names of
+    the aggregate columns produced by that call (``agg_features_created``).
+    """
+
+    def __init__(self):
+        self.original_features: List[Union[str, int, float]] = []
+        self.agg_features_created: List[Union[str, int, float]] = []
+
+    def create_groupby_agg_features(
+        self,
+        df: Union[pd.DataFrame, pl.DataFrame],
+        groupby_columns: List[str],
+        columns_to_agg: Optional[List[str]],
+        target_col: Optional[str],
+        aggregations: Optional[List[str]] = None,
+    ) -> pd.DataFrame:
+        """
+        Create aggregations based on groups for a given DataFrame.
+
+        The result has one row per input row (in input order) and contains the
+        groupby columns, one ``{column}_{aggregation}`` column per requested
+        combination and, if present in ``df``, the target column as last column.
+
+        :param df: Either Pandas or Polars DataFrame.
+        :param groupby_columns: List of column names to use for the groupby.
+        :param columns_to_agg: List of columns to aggregate. If empty or None all columns
+            except the groupby columns and the target column (target_col) will be chosen.
+            The passed list is not modified.
+        :param target_col: Target column name. Will be ignored during aggregation.
+        :param aggregations: Aggregations to perform. Each entry must be the name of a Polars
+            expression method. If not provided, ["min", "max", "mean", "sum"] will be used.
+        :return: Pandas DataFrame with groupby columns, aggregated features and (optionally)
+            the target column.
+        """
+        if not isinstance(aggregations, list):
+            aggregations = ["min", "max", "mean", "sum"]
+
+        if isinstance(df, pd.DataFrame):
+            df = pl.from_dataframe(df)
+
+        self.original_features = list(df.columns)
+        # Reset on every call: otherwise reusing one instance would accumulate
+        # duplicate feature names from earlier calls.
+        self.agg_features_created = []
+
+        # Determine which columns to aggregate. Always build a new list so the
+        # caller's ``columns_to_agg`` argument is never mutated.
+        if columns_to_agg:
+            selected = [col for col in columns_to_agg if col != target_col]
+        else:
+            # Aggregating a group key within its own group carries no information,
+            # so the default selection skips the groupby columns as well.
+            skipped = set(groupby_columns)
+            selected = [
+                col
+                for col in df.columns
+                if col != target_col and col not in skipped
+            ]
+
+        # Define the aggregation operations as window expressions. Evaluating each
+        # aggregate over the groups broadcasts it back to every row, which keeps
+        # the input row order and avoids the group_by + join round trip (whose
+        # output order is not deterministic).
+        agg_ops = []
+        for col in selected:
+            for agg in aggregations:
+                feature_name = f"{col}_{agg}"
+                agg_ops.append(
+                    getattr(pl.col(col), agg)()
+                    .over(groupby_columns)
+                    .alias(feature_name)
+                )
+                self.agg_features_created.append(feature_name)
+
+        # Optionally add the target column back to the final DataFrame
+        trailing_cols = [target_col] if target_col in self.original_features else []
+
+        df_result = df.select(
+            [pl.col(col) for col in groupby_columns]
+            + agg_ops
+            + [pl.col(col) for col in trailing_cols]
+        )
+
+        return df_result.to_pandas()
+
+
 class FeatureClusteringScorer:
     def __init__(
         self,

diff --git a/bluecast/tests/test_feature_creation.py b/bluecast/tests/test_feature_creation.py
@@ -1,11 +1,13 @@
 from collections import namedtuple
 
 import pandas as pd
+import polars as pl
 import pytest
 
 from bluecast.preprocessing.feature_creation import (
     AddRowLevelAggFeatures,
     FeatureClusteringScorer,
+    GroupLevelAggFeatures,
 )
 
 
@@ -229,3 +231,136 @@ def test_changing_features(synthetic_data):
     assert cluster_results.head(1)["loyalty"].values[0] == 1
     assert cluster_results.tail(1)["loyalty"].values[0] == 4
     assert cluster_results["total_score"].max() == 11
+
+
+# Tests for the GroupLevelAggFeatures feature creator
+@pytest.fixture
+def polars_dataframe(sample_dataframe):
+    """Provide the ``sample_dataframe`` fixture converted to a Polars DataFrame."""
+    as_polars = pl.from_pandas(sample_dataframe)
+    return as_polars
+
+
+def test_initialization_grouplevelaggfeatures():
+    """A freshly constructed aggregator starts with both feature lists empty."""
+    fresh = GroupLevelAggFeatures()
+    for tracked_names in (fresh.original_features, fresh.agg_features_created):
+        assert tracked_names == []
+
+
+def test_create_groupby_agg_features_pandas(sample_dataframe):
+    """Pandas input: mean/sum of A and B grouped by "target", with "C" as target column."""
+    feature_builder = GroupLevelAggFeatures()
+    new_features = ["A_mean", "A_sum", "B_mean", "B_sum"]
+
+    output = feature_builder.create_groupby_agg_features(
+        sample_dataframe, ["target"], ["A", "B"], "C", ["mean", "sum"]
+    )
+
+    # groupby column first, then the aggregates, then the target column
+    assert list(output.columns) == ["target", *new_features, "C"]
+    assert feature_builder.original_features == ["A", "B", "C", "target"]
+    assert feature_builder.agg_features_created == new_features
+
+
+def test_create_groupby_agg_features_polars(polars_dataframe):
+    """Polars input: min/max of A and B grouped by "target", with "C" as target column."""
+    feature_builder = GroupLevelAggFeatures()
+    new_features = ["A_min", "A_max", "B_min", "B_max"]
+
+    output = feature_builder.create_groupby_agg_features(
+        polars_dataframe, ["target"], ["A", "B"], "C", ["min", "max"]
+    )
+
+    # groupby column first, then the aggregates, then the target column
+    assert list(output.columns) == ["target", *new_features, "C"]
+    assert feature_builder.original_features == ["A", "B", "C", "target"]
+    assert feature_builder.agg_features_created == new_features
+
+
+def test_create_groupby_agg_features_no_columns_to_agg(sample_dataframe):
+    """Passing ``columns_to_agg=None`` aggregates A and B with the default aggregations."""
+    feature_builder = GroupLevelAggFeatures()
+    # Same order as the explicit list: all aggregations of A, then all of B.
+    new_features = [
+        f"{column}_{stat}"
+        for column in ("A", "B")
+        for stat in ("min", "max", "mean", "sum")
+    ]
+
+    output = feature_builder.create_groupby_agg_features(
+        sample_dataframe, ["target"], None, "C"
+    )
+
+    assert list(output.columns) == ["target"] + new_features + ["C"]
+    assert feature_builder.original_features == ["A", "B", "C", "target"]
+    assert feature_builder.agg_features_created == new_features
+
+
+def test_create_groupby_agg_features_no_target_column(sample_dataframe):
+    """With ``target_col=None`` the output holds only the groupby column and the aggregates."""
+    feature_builder = GroupLevelAggFeatures()
+    # Same order as the explicit list: all aggregations of A, then all of B.
+    new_features = [
+        f"{column}_{stat}"
+        for column in ("A", "B")
+        for stat in ("min", "max", "mean", "sum")
+    ]
+
+    output = feature_builder.create_groupby_agg_features(
+        sample_dataframe, ["target"], ["A", "B"], None
+    )
+
+    assert list(output.columns) == ["target"] + new_features
+    assert feature_builder.original_features == ["A", "B", "C", "target"]
+    assert feature_builder.agg_features_created == new_features
+
+
+def test_create_groupby_agg_features_default_aggregations(sample_dataframe):
+    """Omitting ``aggregations`` yields min, max, mean and sum for the chosen column."""
+    feature_builder = GroupLevelAggFeatures()
+    new_features = [f"A_{stat}" for stat in ("min", "max", "mean", "sum")]
+
+    output = feature_builder.create_groupby_agg_features(
+        sample_dataframe, ["target"], ["A"], None
+    )
+
+    assert list(output.columns) == ["target"] + new_features
+    assert feature_builder.original_features == ["A", "B", "C", "target"]
+    assert feature_builder.agg_features_created == new_features