From 5efd0fb2713030c6384426b06b50585348266a27 Mon Sep 17 00:00:00 2001 From: Sergey Date: Sat, 27 Jan 2024 22:54:19 +0300 Subject: [PATCH] Fast topological features (#1252) * add fast topo * fix fast topo * Automated autopep8 fixes * pep8 * add to initial assumption * make code more clear in `fit` method * add `fast_topological_features` to docs * fix alignment for pipeline builders in `TSForecastingAssumptions` * add topo to `FedotBuilder` docs * fix table in docs * add params to fast_topo * change params and add it to tuner search space * fix integration tests * add new param stride * fix param * dirty speedup * delete ica from initial assumption due to instability * fix documentation * fix test * delete fast_topo from assumption and delete fast_train tag --------- Co-authored-by: github-actions[bot] --- .../fedot_features/automation_features.rst | 6 +- .../api_utils/assumptions/task_assumptions.py | 3 +- fedot/api/builder.py | 2 + .../evaluation/common_preprocessing.py | 6 +- .../topological/fast_topological_extractor.py | 59 +++++++++++++++++++ fedot/core/pipelines/tuning/search_space.py | 16 +++++ .../data/data_operation_repository.json | 14 +++++ .../data/default_operation_params.json | 7 +++ test/integration/models/test_model.py | 14 +++-- 9 files changed, 118 insertions(+), 9 deletions(-) create mode 100644 fedot/core/operations/evaluation/operation_implementations/data_operations/topological/fast_topological_extractor.py diff --git a/docs/source/introduction/fedot_features/automation_features.rst b/docs/source/introduction/fedot_features/automation_features.rst index 14e20ac644..88489b0028 100644 --- a/docs/source/introduction/fedot_features/automation_features.rst +++ b/docs/source/introduction/fedot_features/automation_features.rst @@ -69,7 +69,8 @@ FEDOT supports bunch of dimensionality preprocessing operations that can be be a `one_hot_encoding`,One-Hot Encoder, Feature encoding `label_encoding`,Label Encoder, Feature encoding `resample`,Imbalanced 
binary class transformation in classification, Data transformation - `topological_features`,Calculation of topological features, only for time series,Data transformation + `topological_features`,Calculation of topological features,Time series transformation + `fast_topological_features`,Fast calculation of part of topological features,Time series transformation .. csv-table:: Feature transformation operations implementations @@ -105,7 +106,8 @@ FEDOT supports bunch of dimensionality preprocessing operations that can be be a `one_hot_encoding`,`sklearn.preprocessing.OneHotEncoder`, `label_encoding`,`sklearn.preprocessing.LabelEncoder`,`fast_train` `*tree` `resample`,`FEDOT model using sklearn.utils.resample`, - `topological_features`,FEDOT model,`ts` + `topological_features`,FEDOT model,`ts`, + `fast_topological_features`,FEDOT model,`ts` Models used diff --git a/fedot/api/api_utils/assumptions/task_assumptions.py b/fedot/api/api_utils/assumptions/task_assumptions.py index 8afa713cd1..8090436639 100644 --- a/fedot/api/api_utils/assumptions/task_assumptions.py +++ b/fedot/api/api_utils/assumptions/task_assumptions.py @@ -52,7 +52,8 @@ class TSForecastingAssumptions(TaskAssumptions): def builders(self): return { 'lagged_ridge': - PipelineBuilder().add_sequence('lagged', 'ridge'), + PipelineBuilder() + .add_sequence('lagged', 'ridge'), 'topological': PipelineBuilder() .add_node('lagged') diff --git a/fedot/api/builder.py b/fedot/api/builder.py index fe2c2472c1..90fd81ceed 100644 --- a/fedot/api/builder.py +++ b/fedot/api/builder.py @@ -329,6 +329,8 @@ def setup_pipeline_structure( - ``diff_filter`` -> Derivative Filter Transformation - ``cut`` -> Cut Transformation - ``exog_ts`` -> Exogeneus Transformation + - ``topological_features`` -> Topological features + - ``fast_topological_features`` -> Fast implementation of topological features max_depth: max depth of a pipeline. Defaults to ``6``. 
diff --git a/fedot/core/operations/evaluation/common_preprocessing.py b/fedot/core/operations/evaluation/common_preprocessing.py index 3a6424efb0..5f9ba45906 100644 --- a/fedot/core/operations/evaluation/common_preprocessing.py +++ b/fedot/core/operations/evaluation/common_preprocessing.py @@ -8,6 +8,9 @@ from fedot.core.operations.evaluation.operation_implementations.data_operations.sklearn_transformations import \ ImputationImplementation, KernelPCAImplementation, NormalizationImplementation, PCAImplementation, \ PolyFeaturesImplementation, ScalingImplementation, FastICAImplementation +from fedot.core.operations.evaluation.operation_implementations.\ + data_operations.topological.fast_topological_extractor import \ + FastTopologicalFeaturesImplementation from fedot.core.operations.evaluation.operation_implementations.data_operations.topological. \ topological_extractor import TopologicalFeaturesImplementation from fedot.core.operations.operation_parameters import OperationParameters @@ -47,7 +50,8 @@ class FedotPreprocessingStrategy(EvaluationStrategy): 'one_hot_encoding': OneHotEncodingImplementation, 'label_encoding': LabelEncodingImplementation, 'fast_ica': FastICAImplementation, - 'topological_features': TopologicalFeaturesImplementation + 'topological_features': TopologicalFeaturesImplementation, + 'fast_topological_features': FastTopologicalFeaturesImplementation, } def __init__(self, operation_type: str, params: Optional[OperationParameters] = None): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/topological/fast_topological_extractor.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/topological/fast_topological_extractor.py new file mode 100644 index 0000000000..75ad31ab39 --- /dev/null +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/topological/fast_topological_extractor.py @@ -0,0 +1,59 @@ +from itertools import chain +from typing import Optional + 
+import numpy as np +from gph import ripser_parallel as ripser +from joblib import Parallel, delayed + +from fedot.core.data.data import InputData, OutputData +from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \ + DataOperationImplementation +from fedot.core.operations.operation_parameters import OperationParameters + + +class FastTopologicalFeaturesImplementation(DataOperationImplementation): + def __init__(self, params: Optional[OperationParameters] = None): + super().__init__(params) + self.window_size_as_share = params.get('window_size_as_share') + self.max_homology_dimension = params.get('max_homology_dimension') + self.metric = params.get('metric') + self.stride = params.get('stride') + self.n_jobs = params.get('n_jobs') + self.quantiles = (0.1, 0.25, 0.5, 0.75, 0.9) + self._shape = len(self.quantiles) + self._window_size = None + + def fit(self, input_data: InputData): + self._window_size = int(input_data.features.shape[1] * self.window_size_as_share) + self._window_size = max(self._window_size, 2) + self._window_size = min(self._window_size, input_data.features.shape[1] - 2) + return self + + def transform(self, input_data: InputData) -> OutputData: + features = input_data.features + with Parallel(n_jobs=self.n_jobs, prefer='processes') as parallel: + topological_features = parallel(delayed(self._extract_features) + (np.mean(features[i:i+2, ::self.stride], axis=0)) + for i in range(0, features.shape[0], 2)) + if len(topological_features) * 2 < features.shape[0]: + topological_features.append(topological_features[-1]) + result = np.array(list(chain(*zip(topological_features, topological_features)))) + if result.shape[0] > features.shape[0]: + result = result[:-1, :] + np.nan_to_num(result, copy=False, nan=0, posinf=0, neginf=0) + return result + + def _extract_features(self, x): + x_sliced = np.array([x[i:self._window_size + i] for i in range(x.shape[0] - self._window_size + 1)]) + x_processed = ripser(x_sliced, 
+ maxdim=self.max_homology_dimension, + coeff=2, + metric=self.metric, + n_threads=1, + collapse_edges=False)["dgms"] + result = np.zeros(self._shape * (self.max_homology_dimension + 1)) + for i, xp in enumerate(x_processed): + if xp.shape[0] > 0: + result[i * self._shape:(i + 1) * self._shape] = np.quantile(xp[:, 1] - xp[:, 0], self.quantiles, + overwrite_input=True, method='hazen') + return result diff --git a/fedot/core/pipelines/tuning/search_space.py b/fedot/core/pipelines/tuning/search_space.py index 08b67ab60f..b44a02b977 100644 --- a/fedot/core/pipelines/tuning/search_space.py +++ b/fedot/core/pipelines/tuning/search_space.py @@ -768,6 +768,22 @@ def get_parameters_dict(self): 'sampling-scope': [0.9, 0.99], 'type': 'continuous'} }, + 'fast_topological_features': { + 'window_size_as_share': { + 'hyperopt-dist': hp.uniform, + 'sampling-scope': [0.1, 0.9], + 'type': 'continuous' + }, + 'max_homology_dimension': { + 'hyperopt-dist': hp.uniformint, + 'sampling-scope': [1, 5], + 'type': 'discrete' + }, + 'metric': { + 'hyperopt-dist': hp.choice, + 'sampling-scope': [['euclidean', 'manhattan', 'cosine']], + 'type': 'categorical'} + } } if self.custom_search_space is not None: diff --git a/fedot/core/repository/data/data_operation_repository.json b/fedot/core/repository/data/data_operation_repository.json index 6fe73f19b2..e171b36b96 100644 --- a/fedot/core/repository/data/data_operation_repository.json +++ b/fedot/core/repository/data/data_operation_repository.json @@ -256,6 +256,20 @@ "presets": [ "ts" ], + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", + "tags": [ + "non_applicable_for_ts", + "feature_space_transformation" + ] + }, + "fast_topological_features": { + "meta": "custom_ts_preprocessing", + "presets": [ + "ts" + ], + "input_type": "[DataTypesEnum.table]", + "output_type": "[DataTypesEnum.table]", "tags": [ "non_applicable_for_ts", "feature_space_transformation" diff --git 
a/fedot/core/repository/data/default_operation_params.json b/fedot/core/repository/data/default_operation_params.json index 431ccf2d05..a6abf66cb0 100644 --- a/fedot/core/repository/data/default_operation_params.json +++ b/fedot/core/repository/data/default_operation_params.json @@ -160,5 +160,12 @@ }, "topological_features": { "n_jobs": -1 + }, + "fast_topological_features": { + "n_jobs": 1, + "window_size_as_share": 0.66, + "max_homology_dimension": 1, + "metric": "euclidean", + "stride": 1 } } \ No newline at end of file diff --git a/test/integration/models/test_model.py b/test/integration/models/test_model.py index 6216c2ab95..09873db356 100644 --- a/test/integration/models/test_model.py +++ b/test/integration/models/test_model.py @@ -54,16 +54,16 @@ def get_data_for_testing(task_type, data_type, length=100, features_count=1, return None if task_type is TaskTypesEnum.ts_forecasting: - task = Task(task_type, TsForecastingParams(max(length // 10, 2))) + forecast_length = max(length // 10, 2) + task = Task(task_type, TsForecastingParams(forecast_length)) if data_type is DataTypesEnum.ts: features = np.zeros(length) + value else: features = np.zeros((length, features_count)) + value if data_type is DataTypesEnum.table: - target = np.zeros(length) + value + target = np.zeros((length, forecast_length)) + value else: target = features - else: task = Task(task_type) data_type = DataTypesEnum.table @@ -156,11 +156,15 @@ def fit_time_for_operation(operation: OperationMetaInfo, return perf_counter() - start_time for task_type in operation.task_type: - for data_type in operation.input_types: + input_types = operation.input_types + if task_type is TaskTypesEnum.ts_forecasting: + if operation.input_types == [DataTypesEnum.table]: + input_types = [DataTypesEnum.ts] + for data_type in input_types: perfomance_values = [] for length in data_lengths: data = get_data_for_testing(task_type, data_type, - length=length, features_count=2, + length=length, features_count=10, 
random=True) if data is not None: min_evaluated_time = min(fit_time_for_operation(operation, data) for _ in range(times))