Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Group split kfold #484

Merged
merged 9 commits into from
Jan 14, 2025
150 changes: 124 additions & 26 deletions bofire/surrogates/trainable.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import warnings
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GroupShuffleSplit, KFold, StratifiedKFold

from bofire.data_models.enum import OutputFilteringEnum
from bofire.data_models.features.api import (
Expand All @@ -21,6 +22,13 @@ class TrainableSurrogate(ABC):
_output_filtering: OutputFilteringEnum = OutputFilteringEnum.ALL

def fit(self, experiments: pd.DataFrame, options: Optional[Dict] = None):
"""
Fit the surrogate model to the provided experiments.

Args:
experiments (pd.DataFrame): The experimental data to fit the model.
options (Optional[Dict], optional): Additional options for fitting the model. Defaults to None.
"""
# validate
experiments = self.inputs.validate_experiments(experiments, strict=False) # type: ignore
experiments = self.outputs.validate_experiments(experiments) # type: ignore
Expand All @@ -34,6 +42,15 @@ def fit(self, experiments: pd.DataFrame, options: Optional[Dict] = None):
self._fit(X=X, Y=Y, **options)

def _preprocess_experiments(self, experiments: pd.DataFrame) -> pd.DataFrame:
"""
Preprocess the experiments based on the output filtering setting.

Args:
experiments (pd.DataFrame): The experimental data to preprocess.

Returns:
pd.DataFrame: The preprocessed experimental data.
"""
if self._output_filtering is None:
return experiments
if self._output_filtering == OutputFilteringEnum.ALL:
Expand All @@ -50,6 +67,13 @@ def _preprocess_experiments(self, experiments: pd.DataFrame) -> pd.DataFrame:

@abstractmethod
def _fit(self, X: pd.DataFrame, Y: pd.DataFrame, **kwargs):
"""
Abstract method to fit the model to the provided data.

Args:
X (pd.DataFrame): The input features.
Y (pd.DataFrame): The output targets.
"""
pass

def cross_validate(
Expand All @@ -60,6 +84,7 @@ def cross_validate(
include_labcodes: bool = False,
random_state: Optional[int] = None,
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
hooks: Optional[
Dict[
str,
Expand Down Expand Up @@ -88,6 +113,11 @@ def cross_validate(
Defaults to None.
stratified_feature (str, optional): The feature name to preserve the percentage of samples for each class in
the stratified folds. Defaults to None.
group_split_column (str, optional): The column name of the group id.
This parameter is used to ensure that the splits are made such that the same group is not present in both
training and testing sets. This is useful in scenarios where data points are related or dependent on each
other, and splitting them into different sets would violate the assumption of independence. The number of
unique groups must be greater than or equal to the number of folds. Defaults to None.
hooks (Dict[str, Callable[[Model, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], Any]], optional):
Dictionary of callable hooks that are called within the CV loop. The callable retrieves the current trained
modeld and the current CV folds in the following order: X_train, y_train, X_test, y_test. Defaults to {}.
Expand Down Expand Up @@ -126,20 +156,23 @@ def cross_validate(
"The feature to be stratified needs to be a DiscreteInput, CategoricalInput, CategoricalOutput, or ContinuousOutput",
)

if group_split_column is not None:
# check if the group split column is present in the experiments
if group_split_column not in experiments.columns:
raise ValueError(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also test these errors, so that they are throw correctly?

f"Group split column {group_split_column} is not present in the experiments."
)
ngroups = len(experiments[group_split_column].unique())
# check if the number of unique groups is greater than or equal to the number of folds
if ngroups < folds:
raise ValueError(
f"Number of unique groups {ngroups} is less than the number of folds {folds}."
)

# first filter the experiments based on the model setting
experiments = self._preprocess_experiments(experiments)
n = len(experiments)
if folds > n:
warnings.warn(
f"Training data only has {n} experiments, which is less than folds, fallback to LOOCV.",
)
folds = n
elif n == 0:
raise ValueError("Experiments is empty.")
elif folds < 2 and folds != -1:
raise ValueError("Folds must be -1 for LOO, or > 1.")
elif folds == -1:
folds = n
folds = self._check_valid_nfolds(folds, n)
# preprocess hooks
if hooks is None:
hooks = {}
Expand All @@ -148,19 +181,13 @@ def cross_validate(
hook_results = {key: [] for key in hooks.keys()}

# instantiate kfold object
if stratified_feature is None:
cv = KFold(n_splits=folds, shuffle=True, random_state=random_state)
cv_func = cv.split(experiments)
else:
cv = StratifiedKFold(
n_splits=folds,
shuffle=True,
random_state=random_state,
)
cv_func = cv.split(
experiments.drop([stratified_feature], axis=1),
experiments[stratified_feature],
)
cv, cv_func = self._make_cv_split(
experiments,
folds,
stratified_feature=stratified_feature,
group_split_column=group_split_column,
random_state=random_state,
)

key = self.outputs.get_keys()[0] # type: ignore
train_results = []
Expand Down Expand Up @@ -239,3 +266,74 @@ def cross_validate(
CvResults(results=test_results),
hook_results,
)

def _check_valid_nfolds(self, folds, n):
"""
Check and adjust the number of folds for cross-validation.

Args:
folds (int): The requested number of folds.
n (int): The number of experiments.

Returns:
int: The adjusted number of folds.

Raises:
ValueError: If the number of folds is invalid or if the experiments are empty.
"""
if n == 0:
raise ValueError("Experiments is empty.")
if folds > n:
warnings.warn(
f"Training data only has {n} experiments, which is less than folds, fallback to LOOCV.",
)
folds = n
elif folds < 2 and folds != -1:
raise ValueError("Folds must be -1 for LOO, or > 1.")
elif folds == -1:
folds = n
return folds

def _make_cv_split(
self,
experiments: pd.DataFrame,
folds: int,
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
random_state: Optional[int] = None,
) -> Tuple[
Union[KFold, StratifiedKFold, GroupShuffleSplit],
Generator[Tuple[np.ndarray, np.ndarray], None, None],
]:
"""
Create the cross-validation split object.

Args:
experiments (pd.DataFrame): The experimental data.
folds (int): The number of folds.
random_state (Optional[int]): The random state for reproducibility.
stratified_feature (Optional[str]): The feature to stratify by.
group_split_column (Optional[str]): The column for group splitting.

Returns:
Tuple: The cross-validation split object and the split function.
"""
if stratified_feature is None:
if group_split_column is not None:
# GROUP SPLIT FUNCTIONALITY
cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
cv_func = cv.split(experiments, groups=experiments[group_split_column])
else:
cv = KFold(n_splits=folds, shuffle=True, random_state=random_state)
cv_func = cv.split(experiments)
else:
cv = StratifiedKFold(
n_splits=folds,
shuffle=True,
random_state=random_state,
)
cv_func = cv.split(
experiments.drop([stratified_feature], axis=1),
experiments[stratified_feature],
)
return cv, cv_func
Loading
Loading