diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py
index 8b2db786fd..915599f831 100644
--- a/fedot/core/data/cv_folds.py
+++ b/fedot/core/data/cv_folds.py
@@ -43,7 +43,7 @@ def split(self, data: np.ndarray, *args) -> Iterator[Tuple[InputData, InputData]
 
 
 def cv_generator(data: Union[InputData, MultiModalData],
-                 cv_folds: Optional[int] = None,
+                 cv_folds: int,
                  shuffle: bool = False,
                  random_seed: int = 42,
                  stratify: bool = True,
diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py
index 2d2264721c..022003dfe9 100644
--- a/test/unit/data/test_data_split.py
+++ b/test/unit/data/test_data_split.py
@@ -158,11 +158,8 @@ def test_advanced_time_series_splitting():
 
 
 @pytest.mark.parametrize('data_splitter, data',
-                         # test StratifiedKFold
                          [(DataSourceSplitter(cv_folds=3, shuffle=True), get_imbalanced_data_to_test_mismatch()),
-                         # test KFold
                           (DataSourceSplitter(cv_folds=3, shuffle=True), get_balanced_data_to_test_mismatch()),
-                         # test hold-out
                           (DataSourceSplitter(shuffle=True), get_imbalanced_data_to_test_mismatch()),
                           ])
 def test_data_splitting_without_shape_mismatch(data_splitter: DataSourceSplitter, data: InputData):
@@ -204,11 +201,14 @@ def test_multivariate_time_series_splitting_correct():
     assert len(test_series_data.features) == 20
     assert np.allclose(test_series_data.target, np.array([16, 17, 18, 19]))
 
+
 @pytest.mark.parametrize(('datas_funs', 'cv_folds', 'shuffle', 'stratify'),
                          [# classification + stratify + shuffle + cv_folds
-                          ([partial(get_tabular_classification_data, 100, 5)] * 3, 2, True, True),
+                          ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, True),
                           # classification + shuffle + cv_folds
-                          ([partial(get_tabular_classification_data, 100, 5)] * 3, 2, True, False),
+                          ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, False),
+                          # classification + cv_folds
+                          ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, False, False),
                           # classification + stratify + shuffle
                           ([partial(get_tabular_classification_data, 100, 5)] * 3, None, True, True),
                           # classification + shuffle
@@ -216,7 +216,7 @@ def test_multivariate_time_series_splitting_correct():
                           # classification
                           ([partial(get_tabular_classification_data, 100, 5)] * 3, None, False, False),
                           # timeseries + cv_folds
-                          ([partial(get_ts_data_to_forecast, 10, 100)] * 3, 2, False, False),
+                          ([partial(get_ts_data_to_forecast, 10, 100)] * 3, 3, False, False),
                           # timeseries
                           ([partial(get_ts_data_to_forecast, 10, 100)] * 3, None, False, False),
                           ])
@@ -225,6 +225,7 @@ def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, str
     data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=stratify)
     data_producer = data_splitter.build(mdata)
     keys = tuple(mdata.keys())
+    features_dimensionality = [subdata.features.shape[1:] for subdata in mdata.values()]
 
     for samples in data_producer():
         for sample in samples:
@@ -237,6 +238,10 @@ def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, str
             idx = [np.reshape(x.idx, (-1, 1)) for x in sample.values()]
             assert np.all(np.diff(np.concatenate(idx, 1), 1) == 0)
 
+            # dimensionality of the features should be the same after splitting
+            split_data_features_dimensionality = [subdata.features.shape[1:] for subdata in sample.values()]
+            assert features_dimensionality == split_data_features_dimensionality
+
     # shuffle should be done
     if shuffle:
         for key in keys:
@@ -248,7 +253,6 @@ def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, str
             assert check_stratify(samples[0][key], samples[1][key])
 
 
-
 @pytest.mark.parametrize("cv_generator, data",
                          [(partial(cv_generator, cv_folds=5), get_classification_data()[0]),
diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py b/test/unit/optimizer/test_pipeline_objective_eval.py
index 6e176ec4bf..052d67d667 100644
--- a/test/unit/optimizer/test_pipeline_objective_eval.py
+++ b/test/unit/optimizer/test_pipeline_objective_eval.py
@@ -17,7 +17,6 @@
 from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum, MetricsRepository, \
     RegressionMetricsEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
-from fedot.core.validation.cv_folds import tabular_cv_generator, OneFoldInputDataSplit
 from test.integration.models.test_model import classification_dataset
 from test.unit.tasks.test_forecasting import get_simple_ts_pipeline
 from test.unit.validation.test_table_cv import sample_pipeline
@@ -75,12 +74,12 @@ def empty_datasource():
 )
 def test_pipeline_objective_evaluate_with_different_metrics(classification_dataset, pipeline):
     for metric in ClassificationMetricsEnum:
-        one_fold_split = OneFoldInputDataSplit()
-        data_split = partial(one_fold_split.input_split, input_data=classification_dataset)
+        data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset)
         check_pipeline = deepcopy(pipeline)
-        objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split)
+        objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric),
+                                                   data_producer=data_producer)
         fitness = objective_eval(pipeline)
-        act_fitness = actual_fitness(data_split, check_pipeline, metric)
+        act_fitness = actual_fitness(data_producer, check_pipeline, metric)
         assert fitness.valid
         assert fitness.value is not None
         assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name
@@ -88,11 +87,11 @@
 
 def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset):
     pipeline = empty_pipeline()
-
-    data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset)
+    data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset)
     metric = ClassificationMetricsEnum.ROCAUC_penalty
 
-    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split)
+    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric),
+                                               data_producer=data_producer)
     with pytest.raises(AttributeError):
         objective_eval(pipeline)
 
@@ -100,10 +99,11 @@ def test_pipeline_objective_evaluate_with_cv_fold(classification_dataset):
     pipeline = sample_pipeline()
 
-    cv_fold = partial(tabular_cv_generator, classification_dataset, folds=5)
+    data_producer = DataSourceSplitter(cv_folds=5).build(classification_dataset)
     metric = ClassificationMetricsEnum.logloss
 
-    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), cv_fold)
+    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric),
+                                               data_producer=data_producer)
     fitness = objective_eval(pipeline)
     assert fitness.valid
     assert fitness.value is not None
 
@@ -123,16 +123,20 @@ def test_pipeline_objective_evaluate_with_empty_datasource(classification_datase
 
 
 def test_pipeline_objective_evaluate_with_time_constraint(classification_dataset):
     pipeline = sample_pipeline()
-    data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset)
+    data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset)
     metric = ClassificationMetricsEnum.ROCAUC_penalty
     time_constraint = datetime.timedelta(seconds=0.0001)
-    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split, time_constraint=time_constraint)
+    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric),
+                                               data_producer=data_producer,
+                                               time_constraint=time_constraint)
     fitness = objective_eval(pipeline)
     assert not fitness.valid
 
     time_constraint = datetime.timedelta(seconds=300)
-    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split, time_constraint=time_constraint)
+    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric),
+                                               data_producer=data_producer,
+                                               time_constraint=time_constraint)
     fitness = objective_eval(pipeline)
     assert fitness.valid
     assert fitness.value is not None
@@ -147,9 +151,9 @@ def test_pipeline_objective_evaluate_with_invalid_metrics(classification_dataset
 
     with pytest.raises(Exception):
         pipeline = sample_pipeline()
-        data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset)
-
-        objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), data_split)
+        data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset)
+        objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics),
+                                                   data_producer=data_producer)
         objective_eval(pipeline)
 
 
diff --git a/test/unit/validation/test_table_cv.py b/test/unit/validation/test_table_cv.py
index 6b4df58eac..7888f0f939 100644
--- a/test/unit/validation/test_table_cv.py
+++ b/test/unit/validation/test_table_cv.py
@@ -1,18 +1,16 @@
 import logging
 from datetime import timedelta
-from functools import partial
 
 import pytest
+
 from golem.core.tuning.simultaneous import SimultaneousTuner
 from sklearn.metrics import roc_auc_score as roc_auc
-from sklearn.model_selection import KFold, StratifiedKFold
 
 from fedot.api.main import Fedot
 from fedot.core.composer.composer_builder import ComposerBuilder
 from fedot.core.data.data import InputData
 from fedot.core.data.data_split import train_test_data_setup
 from fedot.core.optimisers.objective import PipelineObjectiveEvaluate
-from fedot.core.optimisers.objective.data_objective_advisor import DataObjectiveAdvisor
 from fedot.core.optimisers.objective.metrics_objective import MetricsObjective
 from fedot.core.pipelines.node import PipelineNode
 from fedot.core.pipelines.pipeline import Pipeline
@@ -22,7 +20,7 @@
 from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 from fedot.core.utils import fedot_project_root
-from fedot.core.validation.cv_folds import tabular_cv_generator
+from fedot.core.optimisers.objective.data_source_splitter import DataSourceSplitter
 from test.integration.models.test_model import classification_dataset
 from test.unit.tasks.test_classification import get_iris_data, pipeline_simple
 
@@ -44,32 +42,18 @@ def get_classification_data():
 
 def test_cv_multiple_metrics_evaluated_correct(classification_dataset):
     pipeline = sample_pipeline()
-    cv_folds = partial(tabular_cv_generator, classification_dataset, folds=5)
+    data_producer = DataSourceSplitter(cv_folds=5).build(classification_dataset)
     metrics = [ClassificationMetricsEnum.ROCAUC_penalty,
                ClassificationMetricsEnum.accuracy,
                ClassificationMetricsEnum.logloss]
-    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), cv_folds)
+    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics),
+                                               data_producer=data_producer)
     actual_values = objective_eval(pipeline).values
     all_metrics_correct = all(0 < abs(x) <= 1 for x in actual_values)
 
     assert all_metrics_correct
 
 
-def test_kfold_advisor_works_correct_in_balanced_case():
-    data = get_classification_data()
-    advisor = DataObjectiveAdvisor()
-    split_type = advisor.propose_kfold(data)
-    assert split_type == KFold
-
-
-def test_kfold_advisor_works_correct_in_imbalanced_case():
-    data = get_classification_data()
-    data.target[:-int(len(data.target) * 0.1)] = 0
-    advisor = DataObjectiveAdvisor()
-    split_type = advisor.propose_kfold(data)
-    assert split_type == StratifiedKFold
-
-
 def test_cv_min_kfolds_raise():
     task = Task(task_type=TaskTypesEnum.classification)
     models_repo = OperationTypesRepository()
diff --git a/test/unit/validation/test_time_series_cv.py b/test/unit/validation/test_time_series_cv.py
index e96811e0b5..dac782f062 100644
--- a/test/unit/validation/test_time_series_cv.py
+++ b/test/unit/validation/test_time_series_cv.py
@@ -15,7 +15,7 @@
 from fedot.core.repository.quality_metrics_repository import \
     MetricsRepository, RegressionMetricsEnum
 from fedot.core.repository.tasks import TsForecastingParams
-from fedot.core.validation.cv_folds import ts_cv_generator
+from fedot.core.data.cv_folds import cv_generator
 from test.unit.tasks.test_forecasting import get_simple_ts_pipeline, get_ts_data
 
 log = default_log(prefix=__name__)
@@ -50,7 +50,8 @@ def test_ts_cv_generator_correct():
     validation_horizon = validation_elements_per_fold * folds
 
     i = 0
-    for train_data, test_data in ts_cv_generator(time_series, folds, validation_blocks, log):
+    for train_data, test_data in cv_generator(time_series, cv_folds=folds,
+                                              validation_blocks=validation_blocks):
         train_len = len(train_data.idx)
         assert train_len == ts_len - validation_horizon
         validation_horizon -= validation_elements_per_fold
@@ -58,23 +59,6 @@
         i += 1
     assert i == folds
 
 
-def test_cv_folds_too_large_correct():
-    """ Checks whether cases where the number of folds is too large, causing
-    the number of elements to be validated to be greater than the number of elements
-    in the time series itself, are adequately handled
-
-    In this case a hold-out validation with 1 fold and 3 validation blocks must be performed
-    """
-    folds = 50
-    forecast_len, validation_blocks, time_series = configure_experiment()
-
-    i = 0
-    for train_data, test_data in ts_cv_generator(time_series, folds, validation_blocks, log):
-        i += 1
-    assert len(train_data.idx) == 85
-    assert i == 1
-
-
 def test_tuner_cv_correct():
     """ Checks if the tuner works correctly when using cross validation for
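
Reviewer note: the sketch below summarizes the validation API that the tests in this patch migrate to. It is a minimal illustration, not part of the diff: `input_data` and `time_series` stand in for any prepared `InputData` instances, and any keyword defaults beyond those visible in the hunks above are assumptions.

    # Minimal sketch, assuming `input_data` is an InputData instance prepared elsewhere.
    from fedot.core.data.cv_folds import cv_generator
    from fedot.core.optimisers.objective.data_source_splitter import DataSourceSplitter

    # DataSourceSplitter covers both validation modes exercised by the tests:
    # cv_folds=None yields a single hold-out split, cv_folds=k yields k-fold CV.
    data_producer = DataSourceSplitter(cv_folds=5, shuffle=True).build(input_data)
    for train_data, test_data in data_producer():
        pass  # fit a pipeline on train_data, score it on test_data

    # The same producer is what PipelineObjectiveEvaluate now expects:
    # PipelineObjectiveEvaluate(MetricsObjective(metric), data_producer=data_producer)

    # cv_generator can also be called directly; after this change cv_folds is a
    # required argument, and time-series data additionally takes validation_blocks,
    # e.g. cv_generator(time_series, cv_folds=3, validation_blocks=validation_blocks).
    for train_data, test_data in cv_generator(input_data, cv_folds=5):
        pass

Building one producer per dataset and passing it through the `data_producer` keyword is the pattern applied consistently across the updated tests, replacing the removed `OneFoldInputDataSplit` and `tabular_cv_generator`/`ts_cv_generator` helpers.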