From f2cacc2f42958d3cbca01681711cf3c052125a83 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 21 Nov 2022 23:14:34 +0300 Subject: [PATCH 01/72] accelerated define_column_types --- fedot/preprocessing/data_types.py | 43 ++++++++++--------------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 5f8b8557c0..bff1bd649b 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -412,21 +412,14 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): features_types[column_id] = NAME_CLASS_FLOAT -def define_column_types(table: np.array): +def define_column_types(table: np.ndarray): """ Prepare information about types per columns. For each column store unique types, which column contains. If column with mixed type contain str object additional field 'str_ids' with indices of string objects is prepared """ - - # TODO: current processing is relatively computationally expensive - probably refactor needed - - def type_ignoring_nans(item): - """ Return type of element in the array. If item is np.nan - return NoneType """ - current_type = type(item) - if current_type is float and np.isnan(item): - # Check is current element is nan or not (np.nan is a float type) - return type(None) - return current_type + def to_type(item): + return str(type(item)) + vto_type = np.vectorize(to_type) if table is None: return {} @@ -436,34 +429,24 @@ def type_ignoring_nans(item): for column_id in range(n_columns): current_column = table[:, column_id] - # Check every element in numpy array - it can take a long time! - column_types = list(map(type_ignoring_nans, current_column)) - - # Store only unique values - set_column_types = set(column_types) - # Convert types into string names - column_types_names = list(map(str, set_column_types)) + column_types = np.where(pd.isna(current_column), str(type(None)), vto_type(current_column)) - if len(column_types_names) > 1: - # There are several types in one column - types_names = np.array(column_types, dtype=str) - # Calculate number of string objects in the dataset - str_number = len(np.argwhere(types_names == NAME_CLASS_STR)) - int_number = len(np.argwhere(types_names == NAME_CLASS_INT)) - float_number = len(np.argwhere(types_names == NAME_CLASS_FLOAT)) + if len(np.unique(column_types)) > 1: + str_number = (column_types == NAME_CLASS_STR).sum() + int_number = (column_types == NAME_CLASS_INT).sum() + float_number = (column_types == NAME_CLASS_FLOAT).sum() # Store information about nans in the target - nan_ids = np.ravel(np.argwhere(types_names == NAME_CLASS_NONE)) - nan_number = len(nan_ids) - columns_info.update({column_id: {'types': column_types_names, + nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) # TODO: maybe just convert to list to preserve idx pairs? 
+ columns_info.update({column_id: {'types': column_types, 'str_number': str_number, 'int_number': int_number, 'float_number': float_number, - 'nan_number': nan_number, + 'nan_number': len(nan_ids), 'nan_ids': nan_ids}}) else: # There is only one type, or several types such as int and float - columns_info.update({column_id: {'types': column_types_names}}) + columns_info.update({column_id: {'types': column_types}}) return columns_info From 4a5a2cfbcc8f65df654359c9bec72288123c1405 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 22 Nov 2022 12:44:54 +0300 Subject: [PATCH 02/72] hotfix for pytests --- fedot/preprocessing/data_types.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index bff1bd649b..b426b2267a 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -430,15 +430,16 @@ def to_type(item): current_column = table[:, column_id] column_types = np.where(pd.isna(current_column), str(type(None)), vto_type(current_column)) + unique_column_types = np.unique(column_types) - if len(np.unique(column_types)) > 1: + if len(unique_column_types) > 1: str_number = (column_types == NAME_CLASS_STR).sum() int_number = (column_types == NAME_CLASS_INT).sum() float_number = (column_types == NAME_CLASS_FLOAT).sum() # Store information about nans in the target nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) # TODO: maybe just convert to list to preserve idx pairs? - columns_info.update({column_id: {'types': column_types, + columns_info.update({column_id: {'types': unique_column_types, 'str_number': str_number, 'int_number': int_number, 'float_number': float_number, @@ -446,7 +447,7 @@ def to_type(item): 'nan_ids': nan_ids}}) else: # There is only one type, or several types such as int and float - columns_info.update({column_id: {'types': column_types}}) + columns_info.update({column_id: {'types': unique_column_types}}) return columns_info From c0bff917ee9097264bcded33ce2ad17eba2c2063 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 22 Nov 2022 16:49:52 +0300 Subject: [PATCH 03/72] accelerated _clean_extra_spaces --- fedot/preprocessing/preprocessing.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 3e1cce5374..d541f94f98 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -357,10 +357,18 @@ def _clean_extra_spaces(data: InputData) -> InputData: Returns: cleaned ``data`` """ - features = pd.DataFrame(data.features) - features = features.applymap(lambda x: x.strip() if isinstance(x, str) else x) - - data.features = np.array(features) + def strip_all_strs(item: Union[object, str]): + try: + return item.strip() + except AttributeError: + # not an str object + return item + + features_df = pd.DataFrame(data.features) + mixed_or_str = features_df.select_dtypes(object) + features_df[mixed_or_str.columns] = mixed_or_str.applymap(strip_all_strs) + + data.features = features_df.to_numpy() return data @copy_doc(BasePreprocessor.label_encoding_for_fit) From 986d5348954ba1940795adbdbf5fc879d9745538 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 23 Nov 2022 16:55:15 +0300 Subject: [PATCH 04/72] convert num col to str optimized --- fedot/preprocessing/data_types.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/fedot/preprocessing/data_types.py 
b/fedot/preprocessing/data_types.py index b426b2267a..91eeeba6bf 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -494,17 +494,10 @@ def type_by_name(current_type_name: str): def convert_num_column_into_string_array(numerical_column: pd.Series) -> np.array: """ Convert pandas column into numpy one-dimensional array """ - # Convert into string - converted_column = numerical_column.astype(str) - converted_array = converted_column.values - - # If there are nans - insert them - nan_ids = np.ravel(np.argwhere(converted_array == 'nan')) - if len(nan_ids) > 0: - converted_array = converted_array.astype(object) - converted_array[nan_ids] = np.nan - - return converted_array + # convert only non-nans values + true_nums = numerical_column[numerical_column.notna()] + numerical_column[true_nums.index] = true_nums.astype(str, copy=False) + return numerical_column.to_numpy() def _obtain_new_column_type(column_info): From c594f81235da761ba19ed35134a139411b9b6df3 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 23 Nov 2022 17:46:27 +0300 Subject: [PATCH 05/72] type inference fixes --- fedot/preprocessing/data_types.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 91eeeba6bf..4ccb62b4ec 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -9,7 +9,7 @@ from fedot.core.repository.tasks import Task, TaskTypesEnum NoneType = type(None) -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Tuple if TYPE_CHECKING: from fedot.core.data.data import InputData @@ -281,7 +281,7 @@ def _convert_feature_into_one_type(self, mixed_column: np.array, column_info: di return None, 'removed' def _convert_target_into_one_type(self, mixed_column: np.array, column_info: dict, mixed_column_id: int, - task: Task) -> [np.array, str]: + task: Task) -> Tuple[np.ndarray, str]: """ Convert target columns into one type based on column proportions of object and task """ if task.task_type is TaskTypesEnum.classification: # For classification labels are string if at least one element is a string @@ -309,7 +309,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - n_rows, n_cols = data.features.shape + _, n_cols = data.features.shape for column_id in range(n_cols): # For every int/float column perform check column_type = data.supplementary_data.column_types['features'][column_id] @@ -339,7 +339,7 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # There is no transformation for current table return data - n_rows, n_cols = data.features.shape + _, n_cols = data.features.shape for column_id in range(n_cols): if column_id in self.numerical_into_str: numerical_column = pd.Series(data.features[:, column_id]) From 54d10eedb3191b43bbcf7fe0324ed86687c1f2c5 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 24 Nov 2022 18:42:45 +0300 Subject: [PATCH 06/72] categorical.py/data_preprocessing.py refactored --- fedot/core/data/data_preprocessing.py | 63 ++++++++++---------------- fedot/preprocessing/categorical.py | 65 +++++++++++---------------- 2 files changed, 50 insertions(+), 78 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 6d509f67c2..2bdba0c1ce 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,35 +1,35 @@ import numpy as np import pandas as pd +from typing import Tuple, Optional + from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts from fedot.core.repository.dataset_types import DataTypesEnum -def data_type_is_suitable_preprocessing(data: InputData) -> bool: - if data_type_is_table(data) or data_type_is_ts(data) or data_type_is_multi_ts(data): - return True - return False +def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: + return data_type_is_table(data) or data_type_is_ts(data) or data_type_is_multi_ts(data) def replace_inf_with_nans(input_data: InputData): - values_to_replace = [np.inf, -np.inf] - features_with_replaced_inf = np.where(np.isin(input_data.features, - values_to_replace), - np.nan, - input_data.features) - input_data.features = features_with_replaced_inf + features = input_data.features + if features.dtype == object: + print(features[:2]) + try: + features[(features == np.inf) | (features == -np.inf)] = np.nan + except Exception as exc: + print("PROBLEM DTYPE", features.dtype, exc, features) + raise def replace_nans_with_empty_strings(input_data: InputData): """ Replace NaNs with empty strings in input_data.features """ - input_data.features = np.where(pd.isna(input_data.features), - '', - input_data.features) + input_data.features[pd.isna(input_data.features)] = '' -def convert_into_column(array: np.array): +def convert_into_column(array: np.ndarray) -> np.ndarray: """ Perform conversion for data if it is necessary """ if len(array.shape) == 1: return array.reshape(-1, 1) @@ -38,7 +38,7 @@ def convert_into_column(array: np.array): def divide_data_categorical_numerical(input_data: InputData, categorical_ids: list, - non_categorical_ids: list) -> (InputData, InputData): + non_categorical_ids: list) -> Tuple[Optional[InputData], Optional[InputData]]: """ Split tabular InputData into two parts: with numerical and categorical features using list with ids of categorical and numerical features. 
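
An aside on the equality-mask idiom that replace_inf_with_nans now relies on (and that PATCH 07 later keeps while dropping the debug prints): np.isinf rejects object-dtype arrays, while elementwise comparison against np.inf is defined for every cell type. A minimal standalone sketch, with an illustrative toy array rather than FEDOT's real InputData:

    import numpy as np
    import pandas as pd

    # Mixed-type feature table stored as object dtype, as tabular data is here
    features = np.array([[1.0, 'a'],
                         [np.inf, 'b'],
                         [-np.inf, None]], dtype=object)

    # np.isinf(features) raises TypeError on object dtype; '==' works per cell
    mask = (features == np.inf) | (features == -np.inf)
    features[mask] = np.nan  # in-place, unlike the np.where copy it replaces

    print(pd.isna(features).sum())  # two replaced infs plus one None -> 3
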
@@ -65,7 +65,7 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li raise ValueError(f'{prefix} Check data for Nans and inf values') -def find_categorical_columns(table: np.array, column_types: dict = None): +def find_categorical_columns(table: np.ndarray, column_types: dict = None): """ Method for finding categorical and non-categorical columns in tabular data @@ -89,29 +89,16 @@ def find_categorical_columns(table: np.array, column_types: dict = None): return categorical_ids, non_categorical_ids -def force_categorical_determination(table): +def force_categorical_determination(table: np.ndarray): """ Find string columns using 'computationally expensive' approach """ - source_shape = table.shape - columns_number = source_shape[1] if len(source_shape) > 1 else 1 - categorical_ids = [] non_categorical_ids = [] - # For every column in table make check for first element - for column_id in range(0, columns_number): - column = table[:, column_id] if columns_number > 1 else table - col_shape = column.shape - for i in column: - # Check if element is string object or not until the first appearance - if len(col_shape) == 2 and isinstance(i[0], str): - # Column looks like [[n], [n], [n]] - categorical_ids.append(column_id) - break - elif len(col_shape) == 1 and isinstance(i, str): - # Column [n, n, n] - categorical_ids.append(column_id) - break - - if column_id not in categorical_ids: + # For every column in table make check + for column_id, column in enumerate(table.T): + # Check if column is of string objects + if pd.api.types.infer_dtype(column, skipna=True) == 'string': + categorical_ids.append(column_id) + else: non_categorical_ids.append(column_id) return categorical_ids, non_categorical_ids @@ -119,9 +106,7 @@ def force_categorical_determination(table): def data_has_missing_values(data: InputData) -> bool: """ Check data for missing values.""" - if data_type_is_suitable_preprocessing(data): - return pd.DataFrame(data.features).isna().sum().sum() > 0 - return False + return data_type_is_suitable_for_preprocessing(data) and pd.DataFrame(data.features).isna().sum().sum() > 0 def data_has_categorical_features(data: InputData) -> bool: diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 13bfdf5fef..1d7ecfc420 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -2,6 +2,8 @@ import numpy as np import pandas as pd +from typing import Tuple + from sklearn.preprocessing import LabelEncoder from fedot.core.data.data import InputData @@ -25,39 +27,35 @@ def fit(self, input_data: InputData): has str objects. 
If there are such features - convert it into int """ features_types = input_data.supplementary_data.column_types['features'] - categorical_ids, non_categorical_ids = find_categorical_columns(table=input_data.features, - column_types=features_types) + categorical_ids, _ = find_categorical_columns(table=input_data.features, + column_types=features_types) if len(categorical_ids) == 0: # There is no need to process categorical features return self binary_ids_to_convert = [] number_of_columns = input_data.features.shape[-1] - for column_id, number in enumerate(range(number_of_columns)): - column = np.array(input_data.features[:, column_id]) - - # Numpy with strings cannot be processed for nans search - so use pandas - pd_column = pd.Series(column) - is_row_has_nan = pd.isna(pd_column) - nans_number = is_row_has_nan.sum() - if nans_number > 0 and column_id in categorical_ids: + for column_id in range(number_of_columns): + pd_column = pd.Series(input_data.features[:, column_id], copy=True) + has_nan = pd_column.isna() + if has_nan.sum() and column_id in categorical_ids: # This categorical column has nans - column, gap_ids = replace_nans_with_fedot_nans(column, is_row_has_nan) - column_uniques = np.unique(column) + replaced_column, _ = replace_nans_with_fedot_nans(pd_column, has_nan) + column_uniques = replaced_column.unique() if len(column_uniques) <= 3: # There is column with binary categories and gaps self.binary_features_with_nans.append(column_id) binary_ids_to_convert.append(column_id) - self._train_encoder(column, column_id) + self._train_encoder(replaced_column, column_id) else: - column_uniques = np.unique(column) + column_uniques = pd_column.unique() if len(column_uniques) <= 2 and column_id in categorical_ids: # Column contains binary string feature binary_ids_to_convert.append(column_id) # Train encoder for current column - self._train_encoder(column, column_id) + self._train_encoder(pd_column, column_id) self.binary_ids_to_convert = binary_ids_to_convert return self @@ -72,18 +70,18 @@ def transform(self, input_data: InputData) -> InputData: converted_features = [] number_of_columns = input_data.features.shape[-1] - for column_id, number in enumerate(range(number_of_columns)): + for column_id in range(number_of_columns): if column_id in self.binary_ids_to_convert: # If column contains nans - replace them with fedot nans special string - column = input_data.features[:, column_id] - is_row_has_nan = pd.isna(pd.Series(column)) - column, gap_ids = replace_nans_with_fedot_nans(column, is_row_has_nan) + pd_column = pd.Series(input_data.features[:, column_id]) + has_nan = pd_column.isna() + replaced_column, gap_ids = replace_nans_with_fedot_nans(pd_column, has_nan) # Convert into integers - converted_column = self._apply_encoder(column, column_id, gap_ids) + converted_column = self._apply_encoder(replaced_column, column_id, gap_ids) else: # Stay column the same - converted_column = np.array(input_data.features[:, column_id]) + converted_column = input_data.features[:, column_id] converted_features.append(converted_column.reshape((-1, 1))) @@ -110,7 +108,7 @@ def fit_transform(self, input_data: InputData) -> InputData: self.fit(input_data) return self.transform(input_data) - def _train_encoder(self, column: np.array, column_id: int): + def _train_encoder(self, column: pd.Series, column_id: int): """ Convert labels in the column from string into int via Label encoding. So, Label encoder is fitted to do such transformation. 
""" @@ -120,18 +118,11 @@ def _train_encoder(self, column: np.array, column_id: int): # Store fitted label encoder for transform method self.binary_encoders.update({column_id: encoder}) - def _apply_encoder(self, column: np.array, column_id: int, gap_ids: np.array) -> np.array: + def _apply_encoder(self, column: pd.Series, column_id: int, gap_ids: pd.Series) -> np.ndarray: """ Apply already fitted encoders """ encoder = self.binary_encoders[column_id] - encoder_classes = list(encoder.classes_) - - # If the column contains categories not previously encountered - for label in list(set(column)): - if label not in encoder_classes: - encoder_classes.append(label) - - # Extent encoder classes - encoder.classes_ = np.array(encoder_classes) + # Extend encoder classes if the column contains categories not previously encountered + encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) if len(gap_ids) > 0: @@ -142,11 +133,7 @@ def _apply_encoder(self, column: np.array, column_id: int, gap_ids: np.array) -> return converted -def replace_nans_with_fedot_nans(column: np.array, is_row_has_nan): - # There are nans in the columns - find indices of such objects - # True > 0 - gap_ids = np.ravel(np.argwhere(is_row_has_nan.values > 0)) - +def replace_nans_with_fedot_nans(column: pd.Series, has_nan: pd.Series) -> Tuple[pd.Series, pd.Series]: # Add new category - 'fedot_nan' after converting it will be replaced by nans - column[gap_ids] = FEDOT_STR_NAN - return column, gap_ids + column[has_nan] = FEDOT_STR_NAN + return column, has_nan From fcabc4df905337ba44b179701c38a5f199af8bc5 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Fri, 25 Nov 2022 17:12:27 +0300 Subject: [PATCH 07/72] fixed replacing inf with nan --- fedot/core/data/data_preprocessing.py | 10 +++------- fedot/preprocessing/categorical.py | 3 +-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 2bdba0c1ce..9c2ae5fa83 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -13,13 +13,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - if features.dtype == object: - print(features[:2]) - try: - features[(features == np.inf) | (features == -np.inf)] = np.nan - except Exception as exc: - print("PROBLEM DTYPE", features.dtype, exc, features) - raise + has_infs = (features == np.inf) | (features == -np.inf) + if np.any(has_infs): + features[has_infs] = np.nan def replace_nans_with_empty_strings(input_data: InputData): diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 1d7ecfc420..d75541f989 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -1,9 +1,8 @@ from copy import deepcopy +from typing import Tuple import numpy as np import pandas as pd -from typing import Tuple - from sklearn.preprocessing import LabelEncoder from fedot.core.data.data import InputData From cf9447a998932e1a765e40a1d3cd4d318a2a5b9d Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 28 Nov 2022 13:32:19 +0300 Subject: [PATCH 08/72] label encoder same refactoring --- .../data_operations/categorical_encoders.py | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py 
b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 055655f0f8..2adfdf77a9 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -1,13 +1,16 @@ from copy import deepcopy -from typing import Optional, Union +from typing import Optional import numpy as np -from sklearn.preprocessing import OneHotEncoder, LabelEncoder +import pandas as pd + +from sklearn.preprocessing import LabelEncoder, OneHotEncoder from fedot.core.data.data import InputData, OutputData from fedot.core.data.data_preprocessing import find_categorical_columns -from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \ +from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ( DataOperationImplementation +) from fedot.core.operations.operation_parameters import OperationParameters @@ -85,7 +88,7 @@ def _update_column_types(self, output_data: OutputData): output_data.encoded_idx = self.encoded_ids output_data.supplementary_data.column_types['features'] = numerical_columns - def _apply_one_hot_encoding(self, features: np.array) -> np.array: + def _apply_one_hot_encoding(self, features: np.ndarray) -> np.ndarray: """ The method creates a table based on categorical and real features after One Hot Encoding transformation @@ -139,10 +142,7 @@ def transform(self, input_data: InputData) -> OutputData: # If categorical features are exists - transform them inplace in InputData for categorical_id in self.categorical_ids: categorical_column = input_data.features[:, categorical_id] - - # Converting into string - so nans becomes marked as 'nan' - categorical_column = categorical_column.astype(str) - gap_ids = np.ravel(np.argwhere(categorical_column == 'nan')) + gap_ids = pd.isna(categorical_column) transformed = self._apply_label_encoder(categorical_column, categorical_id, gap_ids) copied_data.features[:, categorical_id] = transformed @@ -172,8 +172,8 @@ def _fit_label_encoders(self, input_data: InputData): self.encoders.update({categorical_id: le}) - def _apply_label_encoder(self, categorical_column: np.array, categorical_id: int, - gap_ids: Union[np.array, None]) -> np.array: + def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int, + gap_ids: np.ndarray) -> np.ndarray: """ Apply fitted LabelEncoder for column transformation :param categorical_column: numpy array with categorical features @@ -181,15 +181,7 @@ def _apply_label_encoder(self, categorical_column: np.array, categorical_id: int :param gap_ids: indices of gap elements in array """ column_encoder = self.encoders[categorical_id] - encoder_classes = list(column_encoder.classes_) - - # If the column contains categories not previously encountered - for label in sorted(list(set(categorical_column))): - if label not in encoder_classes: - encoder_classes.append(label) - - # Extent encoder classes - column_encoder.classes_ = np.array(encoder_classes) + column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, categorical_column))) transformed_column = column_encoder.transform(categorical_column) if len(gap_ids) > 0: From 7431c986c73214f2f0676c05df7c91fdb9e4e013 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 28 Nov 2022 21:36:29 +0300 Subject: [PATCH 09/72] logical fix in label encoder --- .../data_operations/categorical_encoders.py | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 2adfdf77a9..641c22c69d 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -142,12 +142,12 @@ def transform(self, input_data: InputData) -> OutputData: # If categorical features are exists - transform them inplace in InputData for categorical_id in self.categorical_ids: categorical_column = input_data.features[:, categorical_id] - gap_ids = pd.isna(categorical_column) + gap_ids: np.ndarray = pd.isna(categorical_column) transformed = self._apply_label_encoder(categorical_column, categorical_id, gap_ids) copied_data.features[:, categorical_id] = transformed - output_data = self._convert_to_output(input_data, + output_data = self._convert_to_output(copied_data, copied_data.features) self._update_column_types(output_data) From a8b9c90fa2917646eb2688c2dc62c59c7a9c5e98 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 30 Nov 2022 15:05:05 +0300 Subject: [PATCH 10/72] nans with cats in unique func fix --- .../data_operations/categorical_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 641c22c69d..3ed53f2fbb 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -181,7 +181,7 @@ def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: i :param gap_ids: indices of gap elements in array """ column_encoder = self.encoders[categorical_id] - column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, categorical_column))) + column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column))) transformed_column = column_encoder.transform(categorical_column) if len(gap_ids) > 0: From 243df8e6ef89ffd0f4422e9150d14a043010f110 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 6 Dec 2022 11:27:07 +0300 Subject: [PATCH 11/72] types fixes --- fedot/core/data/data.py | 2 ++ fedot/preprocessing/data_types.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index fc071d5a99..0247638e96 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -2,12 +2,14 @@ import glob import os + from copy import copy, deepcopy from dataclasses import dataclass, field from typing import Any, Iterable, List, Optional, Tuple, Union import numpy as np import pandas as pd + from golem.core.log import default_log from golem.utilities.requirements_notificator import warn_requirement diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 4ccb62b4ec..6d48c28191 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -110,7 +110,7 @@ def convert_data_for_predict(self, data: InputData): self._retain_columns_info_without_types_conflicts(data) return data - def 
remove_incorrect_features(self, table: np.array, converted_columns: dict): + def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): """ Remove from the table columns with conflicts with types were not resolved @@ -130,7 +130,7 @@ def remove_incorrect_features(self, table: np.array, converted_columns: dict): table = np.delete(table, self.columns_to_del, 1) return table - def features_types_converting(self, features: np.array) -> np.array: + def features_types_converting(self, features: np.ndarray) -> np.array: """ Convert all elements in the data in every feature column into one type :param features: tabular features array @@ -157,7 +157,7 @@ def features_types_converting(self, features: np.array) -> np.array: return features - def target_types_converting(self, target: np.array, task: Task) -> np.array: + def target_types_converting(self, target: np.ndarray, task: Task) -> np.array: """ Convert all elements in every target column into one type :param target: tabular target array @@ -185,7 +185,7 @@ def target_types_converting(self, target: np.array, task: Task) -> np.array: return target - def prepare_column_types_info(self, predictors: np.array, target: np.array = None, + def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = None, task: Task = None) -> dict: """ Prepare information about columns in a form of dictionary Dictionary has two keys: 'target' and 'features' @@ -224,7 +224,7 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): remained_column_types.append(col) data.supplementary_data.column_types['features'] = remained_column_types - def _check_columns_vs_types_number(self, table: np.array, column_types: list): + def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): # Check if columns number correct n_rows, n_cols = table.shape if n_cols != len(column_types): @@ -244,7 +244,7 @@ def _remove_pseudo_str_values_from_str_column(data: pd.DataFrame, column_id: int converted_column.append(cur_column[i]) data.features[:, column_id] = pd.Series(converted_column).values - def _convert_feature_into_one_type(self, mixed_column: np.array, column_info: dict, mixed_column_id: int): + def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int): """ Determine new type for current feature column based on the string ratio. And then convert column into it. :param mixed_column: one-dimensional array with several data types @@ -280,7 +280,7 @@ def _convert_feature_into_one_type(self, mixed_column: np.array, column_info: di self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. 
Drop column.') return None, 'removed' - def _convert_target_into_one_type(self, mixed_column: np.array, column_info: dict, mixed_column_id: int, + def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int, task: Task) -> Tuple[np.ndarray, str]: """ Convert target columns into one type based on column proportions of object and task """ if task.task_type is TaskTypesEnum.classification: @@ -424,7 +424,7 @@ def to_type(item): if table is None: return {} - n_rows, n_columns = table.shape + _, n_columns = table.shape columns_info = {} for column_id in range(n_columns): current_column = table[:, column_id] @@ -438,7 +438,7 @@ def to_type(item): float_number = (column_types == NAME_CLASS_FLOAT).sum() # Store information about nans in the target - nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) # TODO: maybe just convert to list to preserve idx pairs? + nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) columns_info.update({column_id: {'types': unique_column_types, 'str_number': str_number, 'int_number': int_number, @@ -462,7 +462,7 @@ def find_mixed_types_columns(columns_info: dict): return columns_with_mixed_types -def apply_type_transformation(table: np.array, column_types: list, log: LoggerAdapter): +def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter): """ Apply transformation for columns in dataset into desired type. Perform transformation on predict stage when column types were already determined @@ -510,7 +510,7 @@ def _obtain_new_column_type(column_info): return int -def _convert_predict_column_into_desired_type(table: np.array, current_column: np.array, +def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray, column_id: int, current_type, log: LoggerAdapter): try: table[:, column_id] = current_column.astype(current_type) From b6d5e77311ef8dce139c2a7f71a976748182016b Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 12 Dec 2022 16:03:59 +0300 Subject: [PATCH 12/72] minor improvements --- fedot/core/data/data_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 9c2ae5fa83..c0b411a73b 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -102,7 +102,7 @@ def force_categorical_determination(table: np.ndarray): def data_has_missing_values(data: InputData) -> bool: """ Check data for missing values.""" - return data_type_is_suitable_for_preprocessing(data) and pd.DataFrame(data.features).isna().sum().sum() > 0 + return data_type_is_suitable_for_preprocessing(data) and pd.DataFrame(data.features).isna().to_numpy().sum() > 0 def data_has_categorical_features(data: InputData) -> bool: From 21f4ce495878469c18eb635ebfe1af95046bc20a Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 14 Dec 2022 15:17:52 +0300 Subject: [PATCH 13/72] minor conversation fix from PR --- fedot/core/data/data_preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c0b411a73b..6272463009 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,3 +1,5 @@ +from typing import Tuple, Optional + import numpy as np import pandas as pd @@ -132,6 +134,4 @@ def data_has_text_features(data: InputData) -> bool: Returns bool, whether data has text fields or 
not """ # TODO andreygetmanov: make compatible with current text checking - if data.data_type is DataTypesEnum.text: - return True - return False + return data.data_type is DataTypesEnum.text From 001d8b18a4099b502627e3953559e10344e50585 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 15 Dec 2022 13:59:23 +0300 Subject: [PATCH 14/72] fix format + rename semantically --- fedot/core/data/data_preprocessing.py | 2 -- .../data_operations/categorical_encoders.py | 12 ++++++------ .../implementation_interfaces.py | 2 +- fedot/preprocessing/categorical.py | 10 +++++----- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 6272463009..5d25b1463d 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -3,8 +3,6 @@ import numpy as np import pandas as pd -from typing import Tuple, Optional - from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts from fedot.core.repository.dataset_types import DataTypesEnum diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 3ed53f2fbb..fa09cb1b23 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -142,9 +142,9 @@ def transform(self, input_data: InputData) -> OutputData: # If categorical features are exists - transform them inplace in InputData for categorical_id in self.categorical_ids: categorical_column = input_data.features[:, categorical_id] - gap_ids: np.ndarray = pd.isna(categorical_column) + has_nan: np.ndarray = pd.isna(categorical_column) - transformed = self._apply_label_encoder(categorical_column, categorical_id, gap_ids) + transformed = self._apply_label_encoder(categorical_column, categorical_id, has_nan) copied_data.features[:, categorical_id] = transformed output_data = self._convert_to_output(copied_data, @@ -173,21 +173,21 @@ def _fit_label_encoders(self, input_data: InputData): self.encoders.update({categorical_id: le}) def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int, - gap_ids: np.ndarray) -> np.ndarray: + has_nan: np.ndarray) -> np.ndarray: """ Apply fitted LabelEncoder for column transformation :param categorical_column: numpy array with categorical features :param categorical_id: index of current categorical column - :param gap_ids: indices of gap elements in array + :param has_nan: bool array of gap elements in the ``categorical_column`` """ column_encoder = self.encoders[categorical_id] column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column))) transformed_column = column_encoder.transform(categorical_column) - if len(gap_ids) > 0: + if len(has_nan) > 0: # Store np.nan values transformed_column = transformed_column.astype(object) - transformed_column[gap_ids] = np.nan + transformed_column[has_nan] = np.nan return transformed_column diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index f092fe671b..6e4703a6a5 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py 
+++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -219,7 +219,7 @@ def _convert_to_output(input_data: InputData, predict: np.array, return converted -def _convert_to_output_function(input_data: InputData, transformed_features: np.array, +def _convert_to_output_function(input_data: InputData, transformed_features: np.ndarray, data_type: DataTypesEnum = DataTypesEnum.table): """ Function prepare prediction of operation as OutputData object diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index d75541f989..e31937c8f6 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -74,10 +74,10 @@ def transform(self, input_data: InputData) -> InputData: # If column contains nans - replace them with fedot nans special string pd_column = pd.Series(input_data.features[:, column_id]) has_nan = pd_column.isna() - replaced_column, gap_ids = replace_nans_with_fedot_nans(pd_column, has_nan) + replaced_column, has_nan = replace_nans_with_fedot_nans(pd_column, has_nan) # Convert into integers - converted_column = self._apply_encoder(replaced_column, column_id, gap_ids) + converted_column = self._apply_encoder(replaced_column, column_id, has_nan) else: # Stay column the same converted_column = input_data.features[:, column_id] @@ -117,17 +117,17 @@ def _train_encoder(self, column: pd.Series, column_id: int): # Store fitted label encoder for transform method self.binary_encoders.update({column_id: encoder}) - def _apply_encoder(self, column: pd.Series, column_id: int, gap_ids: pd.Series) -> np.ndarray: + def _apply_encoder(self, column: pd.Series, column_id: int, has_nan: pd.Series) -> np.ndarray: """ Apply already fitted encoders """ encoder = self.binary_encoders[column_id] # Extend encoder classes if the column contains categories not previously encountered encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) - if len(gap_ids) > 0: + if len(has_nan) > 0: # Column has nans in its structure - after conversion replace it converted = converted.astype(float) - converted[gap_ids] = np.nan + converted[has_nan] = np.nan return converted From 267f704c2f0249776e93bb06585719a5b4604bbd Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 29 Dec 2022 15:28:53 +0300 Subject: [PATCH 15/72] PR fixes * rid of str variables for types in preprocessor * improved define column_types function in data_types.py --- fedot/core/data/data_preprocessing.py | 9 +- .../data_operations/categorical_encoders.py | 11 +- .../sklearn_transformations.py | 8 +- .../data_operations/ts_transformations.py | 5 +- fedot/core/operations/model.py | 137 +++++++++--------- fedot/preprocessing/categorical.py | 4 +- fedot/preprocessing/data_types.py | 115 ++++++++------- fedot/preprocessing/preprocessing.py | 7 +- test/unit/data/test_supplementary_data.py | 12 +- .../test_data_operations_implementations.py | 27 ++-- .../test_preprocessing_through_api.py | 4 +- test/unit/preprocessing/test_preprocessors.py | 13 +- 12 files changed, 184 insertions(+), 168 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 5d25b1463d..9f4455d87b 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -5,6 +5,7 @@ from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts from fedot.core.repository.dataset_types import DataTypesEnum +from 
fedot.preprocessing.data_types import TYPE_TO_ID def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: @@ -76,11 +77,11 @@ def find_categorical_columns(table: np.ndarray, column_types: dict = None): categorical_ids = [] non_categorical_ids = [] - for column_id, type_name in enumerate(column_types): - if 'str' in str(type_name): - categorical_ids.append(column_id) + for col_id, col_type_id in enumerate(column_types): + if col_type_id == TYPE_TO_ID[str]: + categorical_ids.append(col_id) else: - non_categorical_ids.append(column_id) + non_categorical_ids.append(col_id) return categorical_ids, non_categorical_ids diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index fa09cb1b23..58ee267622 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -12,6 +12,7 @@ DataOperationImplementation ) from fedot.core.operations.operation_parameters import OperationParameters +from fedot.preprocessing.data_types import TYPE_TO_ID class OneHotEncodingImplementation(DataOperationImplementation): @@ -35,9 +36,9 @@ def fit(self, input_data: InputData): :return encoder: trained encoder (optional output) """ features = input_data.features - features_types = input_data.supplementary_data.column_types.get('features') + features_type_ids = input_data.supplementary_data.column_types.get('features') categorical_ids, non_categorical_ids = find_categorical_columns(features, - features_types) + features_type_ids) # Indices of columns with categorical and non-categorical features self.categorical_ids = categorical_ids @@ -79,11 +80,11 @@ def _update_column_types(self, output_data: OutputData): if self.categorical_ids: # There are categorical features in the table col_types = output_data.supplementary_data.column_types['features'] - numerical_columns = [t_name for t_name in col_types if 'str' not in t_name] + numerical_columns = [t_name for t_name in col_types if t_name != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) - numerical_columns.extend([str(int)] * encoded_columns_number) + numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number) output_data.encoded_idx = self.encoded_ids output_data.supplementary_data.column_types['features'] = numerical_columns @@ -159,7 +160,7 @@ def _update_column_types(self, output_data: OutputData): # Categorical features were in the dataset col_types = output_data.supplementary_data.column_types['features'] for categorical_id in self.categorical_ids: - col_types[categorical_id] = str(int) + col_types[categorical_id] = TYPE_TO_ID[int] output_data.supplementary_data.column_types['features'] = col_types diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index b75e70076c..3b7e4b49b3 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -13,6 +13,7 @@ from 
fedot.core.operations.evaluation.operation_implementations. \ implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation from fedot.core.operations.operation_parameters import OperationParameters +from fedot.preprocessing.data_types import TYPE_TO_ID class ComponentAnalysisImplementation(DataOperationImplementation): @@ -87,8 +88,8 @@ def update_column_types(output_data: OutputData) -> OutputData: """Update column types after applying PCA operations """ - n_rows, n_cols = output_data.predict.shape - output_data.supplementary_data.column_types['features'] = [str(float) * n_cols] + _, n_cols = output_data.predict.shape + output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float] * n_cols] return output_data @@ -127,6 +128,7 @@ class FastICAImplementation(ComponentAnalysisImplementation): Args: params: OperationParameters with the hyperparameters """ + def __init__(self, params: Optional[OperationParameters]): super().__init__(params) self.pca = FastICA(**self.params.to_dict()) @@ -195,7 +197,7 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): if cols_number_added > 0: # There are new columns in the table col_types = output_data.supplementary_data.column_types['features'] - col_types.extend([str(float)] * cols_number_added) + col_types.extend([TYPE_TO_ID[float]] * cols_number_added) output_data.supplementary_data.column_types['features'] = col_types diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 6910bf0a30..74dd395fdb 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -13,6 +13,7 @@ ) from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.preprocessing.data_types import TYPE_TO_ID class LaggedImplementation(DataOperationImplementation): @@ -127,12 +128,12 @@ def _update_column_types(self, output_data: OutputData): """ features_n_rows, features_n_cols = output_data.predict.shape - features_column_types = [str(float)] * features_n_cols + features_column_types = [TYPE_TO_ID[float]] * features_n_cols column_types = {'features': features_column_types} if output_data.target is not None and len(output_data.target.shape) > 1: target_n_rows, target_n_cols = output_data.target.shape - column_types.update({'target': [str(float)] * target_n_cols}) + column_types.update({'target': [TYPE_TO_ID[float]] * target_n_cols}) output_data.supplementary_data.column_types = column_types def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py index 0d435a3ce0..8ecd309fd8 100644 --- a/fedot/core/operations/model.py +++ b/fedot/core/operations/model.py @@ -1,68 +1,69 @@ -import numpy as np - -from fedot.core.data.data import OutputData -from fedot.core.operations.operation import Operation -from fedot.core.repository.dataset_types import DataTypesEnum -from fedot.core.repository.operation_types_repository import OperationTypesRepository -from fedot.core.repository.tasks import TaskTypesEnum - - -class Model(Operation): - """Class with ``fit``/``predict`` methods defining the evaluation strategy 
for the task - - Args: - operation_type: name of the model - """ - - def __init__(self, operation_type: str): - super().__init__(operation_type=operation_type) - self.operations_repo = OperationTypesRepository('model') - - @staticmethod - def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData: - """Assign types for tabular data obtained from model predictions.\n - By default, all types of model predictions for tabular data can be clearly defined - """ - if output_data.data_type is not DataTypesEnum.table: - # No column data types info for non-tabular data - return output_data - - is_regression_task = output_data.task.task_type is TaskTypesEnum.regression - is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting - - predict_shape = np.array(output_data.predict).shape - # Add information about features - if is_regression_task or is_ts_forecasting_task: - if len(predict_shape) < 2: - column_info = {'features': [str(float)] * predict_shape[0]} - else: - column_info = {'features': [str(float)] * predict_shape[1]} - else: - if len(predict_shape) < 2: - output_data.predict = output_data.predict.reshape((-1, 1)) - predict_shape = output_data.predict.shape - # Classification task or clustering - if output_mode == 'labels': - column_info = {'features': [str(int)] * predict_shape[1]} - else: - column_info = {'features': [str(float)] * predict_shape[1]} - - # Add information about target - target_shape = output_data.target.shape if output_data.target is not None else None - if target_shape is None: - # There is no target column in output data - output_data.supplementary_data.column_types = column_info - return output_data - - if is_regression_task or is_ts_forecasting_task: - if len(target_shape) > 1: - column_info.update({'target': [str(float)] * target_shape[1]}) - else: - # Array present "time series" - column_info.update({'target': [str(float)] * len(output_data.target)}) - else: - # Classification task or clustering - column_info.update({'target': [str(int)] * predict_shape[1]}) - - output_data.supplementary_data.column_types = column_info - return output_data +import numpy as np + +from fedot.core.data.data import OutputData +from fedot.core.operations.operation import Operation +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.operation_types_repository import OperationTypesRepository +from fedot.core.repository.tasks import TaskTypesEnum +from fedot.preprocessing.data_types import TYPE_TO_ID + + +class Model(Operation): + """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task + + Args: + operation_type: name of the model + """ + + def __init__(self, operation_type: str): + super().__init__(operation_type=operation_type) + self.operations_repo = OperationTypesRepository('model') + + @staticmethod + def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData: + """Assign types for tabular data obtained from model predictions.\n + By default, all types of model predictions for tabular data can be clearly defined + """ + if output_data.data_type is not DataTypesEnum.table: + # No column data types info for non-tabular data + return output_data + + is_regression_task = output_data.task.task_type is TaskTypesEnum.regression + is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting + + predict_shape = np.array(output_data.predict).shape + # Add information about features + if is_regression_task or 
is_ts_forecasting_task:
+        if len(predict_shape) < 2:
+            column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
+        else:
+            column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+    else:
+        if len(predict_shape) < 2:
+            output_data.predict = output_data.predict.reshape((-1, 1))
+            predict_shape = output_data.predict.shape
+        # Classification task or clustering
+        if output_mode == 'labels':
+            column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
+        else:
+            column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+
+    # Add information about target
+    target_shape = output_data.target.shape if output_data.target is not None else None
+    if target_shape is None:
+        # There is no target column in output data
+        output_data.supplementary_data.column_types = column_info
+        return output_data
+
+    if is_regression_task or is_ts_forecasting_task:
+        if len(target_shape) > 1:
+            column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
+        else:
+            # Array present "time series"
+            column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
+    else:
+        # Classification task or clustering
+        column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
+
+    output_data.supplementary_data.column_types = column_info
+    return output_data
diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py
index e31937c8f6..09368286c5 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -7,7 +7,7 @@
 from fedot.core.data.data import InputData
 from fedot.core.data.data_preprocessing import find_categorical_columns
-from fedot.preprocessing.data_types import NAME_CLASS_INT, FEDOT_STR_NAN
+from fedot.preprocessing.data_types import TYPE_TO_ID, FEDOT_STR_NAN


 class BinaryCategoricalPreprocessor:
@@ -91,7 +91,7 @@ def transform(self, input_data: InputData) -> InputData:
         # Update features types
         features_types = copied_data.supplementary_data.column_types['features']
         for converted_column_id in self.binary_ids_to_convert:
-            features_types[converted_column_id] = NAME_CLASS_INT
+            features_types[converted_column_id] = TYPE_TO_ID[int]
         return copied_data

     def fit_transform(self, input_data: InputData) -> InputData:
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index 6d48c28191..e233fa88a5 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -14,10 +14,11 @@
 if TYPE_CHECKING:
     from fedot.core.data.data import InputData

-NAME_CLASS_STR = "<class 'str'>"
-NAME_CLASS_INT = "<class 'int'>"
-NAME_CLASS_FLOAT = "<class 'float'>"
-NAME_CLASS_NONE = "<class 'NoneType'>"
+_convertable_types = (bool, float, int, str, type(None))
+_types_ids = range(len(_convertable_types))
+
+TYPE_TO_ID = dict(zip(_convertable_types, _types_ids))
+
 FEDOT_STR_NAN = 'fedot_nan'
 # If unique values in the feature column is less than 13 - convert column into string type else to numerical
 CATEGORICAL_MAX_UNIQUE_TH = 13
@@ -120,8 +121,7 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict):
         if not converted_columns:
             return table

-        self.columns_to_del = [column_id for column_id, new_type_name in converted_columns.items() if
-                               new_type_name == 'removed']
+        self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id == -1]
         if not self.columns_to_del:
             # There are no columns to delete
             return table
@@ -218,15 +218,15 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData):
         data.features = self.remove_incorrect_features(data.features,
                                                        self.string_columns_transformation_failed)

-        remained_column_types = []
-        for i, col in enumerate(data.supplementary_data.column_types['features']):
-            if i not in self.string_columns_transformation_failed:
-                remained_column_types.append(col)
-        data.supplementary_data.column_types['features'] = remained_column_types
+        data.supplementary_data.column_types['features'] = [
+            col_type
+            for col_id, col_type in enumerate(data.supplementary_data.column_types['features'])
+            if col_id not in self.string_columns_transformation_failed
+        ]

     def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list):
         # Check if columns number correct
-        n_rows, n_cols = table.shape
+        _, n_cols = table.shape
         if n_cols != len(column_types):
             # There is an incorrect types calculation
             self.log.warning('Columns number and types numbers do not match.')
@@ -251,9 +251,9 @@ def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info:
         :param column_info: dictionary with information about types in the column
         :param mixed_column_id: index of column in dataset
         """
-        if len(column_info['types']) == 2 and NAME_CLASS_NONE in column_info['types']:
+        if len(column_info['types']) == 2 and TYPE_TO_ID[type(None)] in column_info['types']:
             # Column contain only one data type and nans
-            filtered_types = [x for x in column_info['types'] if x != NAME_CLASS_NONE]
+            filtered_types = [x for x in column_info['types'] if x != TYPE_TO_ID[type(None)]]
             return mixed_column, filtered_types[0]

         string_objects_number = column_info['str_number']
@@ -272,13 +272,13 @@ def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info:
             mixed_column = mixed_column.astype(object)
             mixed_column[column_info['nan_ids']] = np.nan
             del column_info['nan_ids']
-            return mixed_column, str(suggested_type)
+            return mixed_column, TYPE_TO_ID[suggested_type]
         except ValueError:
             # Cannot convert string objects into int or float (for example 'a' into int)
             prefix = f'Feature column with index {mixed_column_id} contains ' \
                      f'following data types: {column_info["types"]}.'
             self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.')
-            return None, 'removed'
+            return None, -1

     def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int,
                                       task: Task) -> Tuple[np.ndarray, str]:
@@ -291,7 +291,7 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: d
         try:
             mixed_column = mixed_column.astype(suggested_type)
-            return mixed_column, str(suggested_type)
+            return mixed_column, TYPE_TO_ID[suggested_type]
         except ValueError:
             # Cannot convert string objects into int or float (for example 'a' into int)
             target_column = pd.Series(mixed_column)
@@ -302,7 +302,7 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: d
             log_message = f'{prefix} String cannot be converted into {suggested_type}. Ignore non converted values.'
             self.log.debug(log_message)
             self.target_converting_has_errors = True
-            return converted_column.values, str(suggested_type)
+            return converted_column.values, TYPE_TO_ID[suggested_type]

     def _into_categorical_features_transformation_for_fit(self, data: InputData):
         """
@@ -313,7 +313,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
         for column_id in range(n_cols):
             # For every int/float column perform check
             column_type = data.supplementary_data.column_types['features'][column_id]
-            if 'int' in column_type or 'float' in column_type:
+            if column_type in [TYPE_TO_ID[int], TYPE_TO_ID[float]]:
                 numerical_column = pd.Series(data.features[:, column_id])

                 # Calculate number of unique values except nans
@@ -331,7 +331,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):

                     # Update information about column types (in-place)
                     features_types = data.supplementary_data.column_types['features']
-                    features_types[column_id] = NAME_CLASS_STR
+                    features_types[column_id] = TYPE_TO_ID[str]

     def _into_categorical_features_transformation_for_predict(self, data: InputData):
         """ Apply conversion into categorical string column for every signed column """
@@ -349,7 +349,7 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData)

             # Update information about column types (in-place)
             features_types = data.supplementary_data.column_types['features']
-            features_types[column_id] = NAME_CLASS_STR
+            features_types[column_id] = TYPE_TO_ID[str]

     def _into_numeric_features_transformation_for_fit(self, data: InputData):
         """
@@ -359,7 +359,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
         for column_id in range(n_cols):
             # For every string column perform converting if necessary
             column_type = data.supplementary_data.column_types['features'][column_id]
-            if 'str' in column_type:
+            if column_type == TYPE_TO_ID[str]:
                 string_column = pd.Series(data.features[:, column_id])

                 # Number of nans in the column
@@ -382,7 +382,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
                     # Update information about column types (in-place)
                     self.categorical_into_float.append(column_id)
                     features_types = data.supplementary_data.column_types['features']
-                    features_types[column_id] = NAME_CLASS_FLOAT
+                    features_types[column_id] = TYPE_TO_ID[float]
                 elif failed_ratio >= self.acceptable_failed_rate_top \
                         and is_column_contain_numerical_objects:
                     # The column consists mostly of truly str values and has a few ints/floats in it
@@ -390,7 +390,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
                 elif self.acceptable_failed_rate_top > failed_ratio >= self.acceptable_failed_rate_bottom:
                     # Probably numerical column contains a lot of '?' or 'x' as nans equivalents
                     # Add columns to remove list
-                    self.string_columns_transformation_failed.update({column_id: 'removed'})
+                    self.string_columns_transformation_failed.update({column_id: -1})

     def _into_numeric_features_transformation_for_predict(self, data: InputData):
         """ Apply conversion into float string column for every signed column """
@@ -409,7 +409,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData):

             # Update information about column types (in-place)
             features_types = data.supplementary_data.column_types['features']
-            features_types[column_id] = NAME_CLASS_FLOAT
+            features_types[column_id] = TYPE_TO_ID[float]


 def define_column_types(table: np.ndarray):
@@ -417,29 +417,38 @@ def define_column_types(table: np.ndarray):
     types, which column contains. If column with mixed type contain str object additional field 'str_ids' with
     indices of string objects is prepared """
-    def to_type(item):
-        return str(type(item))
-    vto_type = np.vectorize(to_type)
-
     if table is None:
         return {}

     _, n_columns = table.shape
+
+    nans = pd.isna(table)
+    table_of_types = np.empty_like(table, dtype=np.int8)
+    table_of_types[~nans] = [
+        TYPE_TO_ID[type(x.item() if getattr(x, 'item', False) else x)]
+        for x in table[~nans]
+    ]
+    table_of_types[nans] = TYPE_TO_ID[type(None)]
+
     columns_info = {}
     for column_id in range(n_columns):
-        current_column = table[:, column_id]
+        col_types = table_of_types[:, column_id]

-        column_types = np.where(pd.isna(current_column), str(type(None)), vto_type(current_column))
-        unique_column_types = np.unique(column_types)
+        unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True)

-        if len(unique_column_types) > 1:
-            str_number = (column_types == NAME_CLASS_STR).sum()
-            int_number = (column_types == NAME_CLASS_INT).sum()
-            float_number = (column_types == NAME_CLASS_FLOAT).sum()
+        if len(unique_col_types) > 1:
+            numbers = [
+                unique_col_types_number[unique_col_types == TYPE_TO_ID[t]]
+                for t in [str, int, float]
+            ]
+            str_number, int_number, float_number = [
+                number.item() if len(number) else 0
+                for number in numbers
+            ]

             # Store information about nans in the target
-            nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE))
-            columns_info.update({column_id: {'types': unique_column_types,
+            nan_ids = np.ravel(np.argwhere(col_types == TYPE_TO_ID[type(None)]))
+            columns_info.update({column_id: {'types': unique_col_types,
                                              'str_number': str_number,
                                              'int_number': int_number,
                                              'float_number': float_number,
@@ -447,7 +456,7 @@ def to_type(item):
         else:
             # There is only one type, or several types such as int and float
-            columns_info.update({column_id: {'types': unique_column_types}})
+            columns_info.update({column_id: {'types': unique_col_types}})

     return columns_info
@@ -469,11 +478,11 @@ def apply_type_transformation(table: np.ndarray, column_types: list, log: Logger
     during fit
     """

-    def type_by_name(current_type_name: str):
-        """ Return type by its name """
-        if 'int' in current_type_name:
+    def type_by_id(current_type_id: int):
+        """ Return type by its ID """
+        if current_type_id == TYPE_TO_ID[int]:
             return int
-        elif 'str' in current_type_name:
+        elif current_type_id == TYPE_TO_ID[str]:
             return str
         else:
             return float
@@ -485,7 +494,7 @@ def type_by_name(current_type_name: str):
     n_rows, n_cols = table.shape
     for column_id in range(n_cols):
         current_column = table[:, column_id]
-        current_type = type_by_name(column_types[column_id])
+        current_type = type_by_id(column_types[column_id])
         _convert_predict_column_into_desired_type(table=table, current_column=current_column,
                                                   current_type=current_type, column_id=column_id, log=log)
@@ -500,7 +509,7 @@ def convert_num_column_into_string_array(numerical_column: pd.Series) -> np.array
     return numerical_column.to_numpy()


-def _obtain_new_column_type(column_info):
+def _obtain_new_column_type(column_info: dict):
    """ Suggest int or float type based on the presence of nan and float values """
     if column_info['float_number'] > 0 or column_info['nan_number'] > 0:
         # Even if one of types are float - all elements should be converted into float
@@ -534,24 +543,24 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict)
     """
     updated_column_types = []
     for column_id, column_info in columns_types_info.items():
-        column_types = column_info['types']
+        column_type_ids = column_info['types']

-        if len(column_types) == 1:
+        if len(column_type_ids) == 1:
             # Column initially contain only one type
-            updated_column_types.append(column_types[0])
-        elif len(column_types) == 2 and NAME_CLASS_NONE in column_types:
+            updated_column_types.append(column_type_ids[0])
+        elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids:
             # Column with one type and nans
-            filtered_types = [x for x in column_types if x != NAME_CLASS_NONE]
+            filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]]
             updated_column_types.append(filtered_types[0])
         else:
-            if any('str' in column_type_name for column_type_name in column_types):
+            if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_type_ids):
                 # Mixed-types column with string
                 new_column_type = converted_columns[column_id]
-                if new_column_type != 'removed':
+                if new_column_type != -1:
                     updated_column_types.append(new_column_type)
             else:
                 # Mixed-types with float and integer
-                updated_column_types.append(NAME_CLASS_FLOAT)
+                updated_column_types.append(TYPE_TO_ID[float])

     return updated_column_types
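
For reference, a minimal standalone sketch of what the vectorized define_column_types computes (toy data; only numpy and pandas assumed, with TYPE_TO_ID rebuilt locally exactly as the hunk above defines it):

import numpy as np
import pandas as pd

_convertable_types = (bool, float, int, str, type(None))
TYPE_TO_ID = dict(zip(_convertable_types, range(len(_convertable_types))))

# A mixed object column: two str values, one int, one nan
column = np.array(['a', 1, 'b', np.nan], dtype=object)

nans = pd.isna(column)
col_types = np.empty_like(column, dtype=np.int8)
col_types[~nans] = [TYPE_TO_ID[type(x)] for x in column[~nans]]
col_types[nans] = TYPE_TO_ID[type(None)]

unique_types, counts = np.unique(col_types, return_counts=True)
# unique_types -> [2, 3, 4] (int, str, NoneType), counts -> [1, 2, 1]

One np.unique call with return_counts=True replaces the per-element Python loop of the old implementation, which is where the speedup comes from.
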
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index d541f94f98..bcbe969a0f 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -29,7 +29,7 @@
 from fedot.preprocessing.base_preprocessing import BasePreprocessor
 from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor
 from fedot.preprocessing.data_type_check import exclude_ts, exclude_multi_ts, exclude_image
-from fedot.preprocessing.data_types import NAME_CLASS_INT, TableTypesCorrector
+from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector
 from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer

 # The allowed percent of empty samples in features.
@@ -357,6 +357,7 @@ def _clean_extra_spaces(data: InputData) -> InputData:
         Returns:
             cleaned ``data``
         """
+
         def strip_all_strs(item: Union[object, str]):
             try:
                 return item.strip()
@@ -472,8 +473,8 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarray
         encoded_target = data.target
         if encoder is not None:
             # Target encoders have already been fitted
-            data.supplementary_data.column_types['target'] = [NAME_CLASS_INT]
-            encoded_target = encoder.transform(data.target)
+            data.supplementary_data.column_types['target'] = [TYPE_TO_ID[int]]
+            encoded_target = encoder.transform(encoded_target)
         if len(encoded_target.shape) == 1:
             encoded_target = encoded_target.reshape((-1, 1))
         return encoded_target
diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py
index 1074aff26a..5d5581139e 100644
--- a/test/unit/data/test_supplementary_data.py
+++ b/test/unit/data/test_supplementary_data.py
@@ -9,8 +9,8 @@
 from fedot.core.pipelines.pipeline import Pipeline
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID
 from test.unit.tasks.test_regression import get_synthetic_regression_data
-from test.unit.data.test_data_merge import unequal_outputs_table


 @pytest.fixture()
@@ -19,15 +19,15 @@ def outputs_table_with_different_types():
     task = Task(TaskTypesEnum.regression)
     idx = [0, 1, 2]
     target = [1, 2, 10]
-    data_info_first = SupplementaryData(column_types={'features': ["<class 'str'>", "<class 'float'>"],
-                                                      'target': ["<class 'int'>"]})
+    data_info_first = SupplementaryData(column_types={'features': [TYPE_TO_ID[str], TYPE_TO_ID[float]],
+                                                      'target': [TYPE_TO_ID[int]]})
     output_first = OutputData(idx=idx, features=None,
                               predict=np.array([['a', 1.1], ['b', 2], ['c', 3]], dtype=object),
                               task=task, target=target, data_type=DataTypesEnum.table,
                               supplementary_data=data_info_first)
-    data_info_second = SupplementaryData(column_types={'features': ["<class 'float'>"],
-                                                       'target': ["<class 'int'>"]})
+    data_info_second = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]],
+                                                       'target': [TYPE_TO_ID[int]]})
     output_second = OutputData(idx=idx, features=None,
                                predict=np.array([[2.5], [2.1], [9.3]], dtype=float),
                                task=task, target=target, data_type=DataTypesEnum.table,
@@ -124,4 +124,4 @@ def test_define_types_after_merging(outputs_table_with_different_types):
     ancestor_target_type = outputs[0].supplementary_data.column_types['target'][0]
     assert target_types[0] == ancestor_target_type
     assert len(features_types) == 3
-    assert tuple(features_types) == ("<class 'str'>", "<class 'float'>", "<class 'float'>")
+    assert tuple(features_types) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float])
diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py
index 9cdb67fe00..8910f154c0 100644
--- a/test/unit/data_operations/test_data_operations_implementations.py
+++ b/test/unit/data_operations/test_data_operations_implementations.py
@@ -21,8 +21,7 @@
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.operation_types_repository import OperationTypesRepository
 from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams
-from fedot.preprocessing.data_types import NAME_CLASS_FLOAT, NAME_CLASS_INT, \
-    NAME_CLASS_STR
+from fedot.preprocessing.data_types import TYPE_TO_ID
 from test.unit.preprocessing.test_preprocessing_through_api import data_with_only_categorical_features


@@ -130,7 +129,7 @@ def get_multivariate_time_series(mutli_ts=False):


 def get_nan_inf_data():
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_FLOAT] * 4})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]] * 4})
     train_input = InputData(idx=[0, 1, 2, 3],
                             features=np.array([[1, 2, 3, 4],
                                                [2, np.nan, 4, 5],
@@ -145,8 +144,8 @@ def get_single_feature_data(task=None):
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_INT],
-                                                'target': [NAME_CLASS_INT]})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int]],
+                                                'target': [TYPE_TO_ID[int]]})
     train_input = InputData(idx=[0, 1, 2, 3, 4, 5],
                             features=np.array([[1], [2], [3], [7], [8], [9]]),
                             target=np.array([[0], [0], [0], [1], [1], [1]]),
@@ -169,10 +168,10 @@ def get_mixed_data(task=None, extended=False):
                              [np.nan, np.nan, '1', np.nan, '2', 'not blue', 'di'],
                              [8, '1', '1', 0, '1', 'not blue', 'da bu'],
                              [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object)
-        features_types = [NAME_CLASS_INT, NAME_CLASS_STR, NAME_CLASS_STR, NAME_CLASS_INT,
-                          NAME_CLASS_STR, NAME_CLASS_STR, NAME_CLASS_STR]
+        features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int],
+                          TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]
         supp_data = SupplementaryData(column_types={'features': features_types,
-                                                    'target': [NAME_CLASS_INT]})
+                                                    'target': [TYPE_TO_ID[int]]})
     else:
         features = np.array([[1, '0', 1],
                              [2, '1', 0],
@@ -180,9 +179,9 @@ def get_mixed_data(task=None, extended=False):
                              [7, '1', 1],
                              [8, '1', 1],
                              [9, '0', 0]], dtype=object)
-        features_types = [NAME_CLASS_INT, NAME_CLASS_STR, NAME_CLASS_INT]
+        features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]
         supp_data = SupplementaryData(column_types={'features': features_types,
-                                                    'target': [NAME_CLASS_INT]})
+                                                    'target': [TYPE_TO_ID[int]]})

     train_input = InputData(idx=[0, 1, 2, 3, 4, 5],
                             features=features,
@@ -201,7 +200,7 @@ def get_nan_binary_data(task=None):
     Binary int columns must be processed as "almost categorical".
     For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33
     """
-    features_types = [NAME_CLASS_INT, NAME_CLASS_STR, NAME_CLASS_INT]
+    features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]
     supp_data = SupplementaryData(column_types={'features': features_types})
     features = np.array([[1, '0', 0],
                          [np.nan, np.nan, np.nan],
@@ -232,8 +231,8 @@ def get_unbalanced_dataset(size=10, disbalance=0.4, target_dim=None):
         target = target.reshape(-1, 1)

     supp_data = SupplementaryData(column_types={
-        'features': [NAME_CLASS_INT, NAME_CLASS_STR],
-        'target': [NAME_CLASS_INT]
+        'features': [TYPE_TO_ID[int], TYPE_TO_ID[str]],
+        'target': [TYPE_TO_ID[int]]
     })

     input_data = InputData(idx=np.arange(features.shape[0]),
@@ -252,7 +251,7 @@ def data_with_binary_int_features_and_equal_categories():
     must be processed as "almost categorical".
     For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33
     """
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_INT, NAME_CLASS_INT]})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int], TYPE_TO_ID[int]]})
     task = Task(TaskTypesEnum.classification)
     features = np.array([[1, 10],
                          [np.nan, np.nan],
diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py
index c1bbed9592..3b0e60fc25 100644
--- a/test/unit/preprocessing/test_preprocessing_through_api.py
+++ b/test/unit/preprocessing/test_preprocessing_through_api.py
@@ -6,12 +6,12 @@
 from fedot.core.data.supplementary_data import SupplementaryData
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
-from fedot.preprocessing.data_types import NAME_CLASS_STR
+from fedot.preprocessing.data_types import TYPE_TO_ID


 def data_with_only_categorical_features():
     """ Generate tabular data with only categorical features. All of them are binary. """
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_STR] * 3})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[str]] * 3})
     task = Task(TaskTypesEnum.regression)
     features = np.array([["'a'", "0", "1"],
                          ["'b'", "1", "0"],
diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py
index 038b9f44af..81fa8bb74b 100644
--- a/test/unit/preprocessing/test_preprocessors.py
+++ b/test/unit/preprocessing/test_preprocessors.py
@@ -9,6 +9,7 @@
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import TaskTypesEnum, Task
 from fedot.core.utils import fedot_project_root
+from fedot.preprocessing.data_types import TYPE_TO_ID
 from fedot.preprocessing.data_types import TableTypesCorrector, apply_type_transformation
 from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME
 from test.unit.preprocessing.test_pipeline_preprocessing import data_with_mixed_types_in_each_column, \
@@ -133,10 +134,10 @@ def test_column_types_converting_correctly():
     assert len(features_types) == 4
     assert len(target_types) == 2
-    assert features_types[0] == "<class 'str'>"
-    assert features_types[1] == "<class 'str'>"
-    assert features_types[2] == "<class 'str'>"
-    assert target_types[0] == target_types[0] == "<class 'str'>"
+    assert features_types[0] == TYPE_TO_ID[str]
+    assert features_types[1] == TYPE_TO_ID[str]
+    assert features_types[2] == TYPE_TO_ID[str]
+    assert target_types[0] == target_types[1] == TYPE_TO_ID[str]


 def test_column_types_process_correctly():
@@ -158,7 +159,7 @@ def test_column_types_process_correctly():
     features_columns = predicted.supplementary_data.column_types['features']
     assert len(features_columns) == predicted.predict.shape[1]
     # All output values are float
-    assert all('float' in str(feature_type) for feature_type in features_columns)
+    assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_columns)


 def test_complicated_table_types_processed_correctly():
@@ -263,7 +264,7 @@ def test_str_numbers_with_dots_and_commas_in_predict():
     input_data = InputData(idx=np.arange(4), features=features, target=target, task=task,
                            data_type=DataTypesEnum.table)

-    transformed_predict = apply_type_transformation(table=input_data.features, column_types=['int'],
+    transformed_predict = apply_type_transformation(table=input_data.features, column_types=[TYPE_TO_ID[int]],
                                                     log=default_log('test_str_numbers_with_dots_and_commas_in_predict'))

    assert all(transformed_predict == np.array([[8], [4], [3], [6]]))
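
To make the contract of this patch concrete: string type names are replaced everywhere by small integer IDs, and -1 becomes the sentinel for a dropped column. A toy illustration (not part of the patch; the reverse map ID_TO_TYPE is a hypothetical helper added here only for the demo):

_convertable_types = (bool, float, int, str, type(None))
TYPE_TO_ID = dict(zip(_convertable_types, range(len(_convertable_types))))
ID_TO_TYPE = {type_id: t for t, type_id in TYPE_TO_ID.items()}  # hypothetical reverse map

converted_columns = {0: TYPE_TO_ID[float], 1: -1, 2: TYPE_TO_ID[str]}
columns_to_del = [col_id for col_id, type_id in converted_columns.items() if type_id == -1]
assert columns_to_del == [1]
assert ID_TO_TYPE[converted_columns[0]] is float

Integer comparisons like column_type == TYPE_TO_ID[str] also avoid the fragile substring checks ('str' in column_type) of the old string-based scheme.
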
From d2d0f9e74eac7bafc6572b49c961e140d404c3a0 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Mon, 23 Jan 2023 16:38:07 +0300
Subject: [PATCH 16/72] PR fixes

---
 fedot/core/operations/model.py    | 138 +++++++++++++++++++-------------
 fedot/preprocessing/data_types.py |   4 +-
 2 files changed, 71 insertions(+), 71 deletions(-)

diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py
index 8ecd309fd8..1499c05a47 100644
--- a/fedot/core/operations/model.py
+++ b/fedot/core/operations/model.py
@@ -1,69 +1,69 @@
-import numpy as np
-
-from fedot.core.data.data import OutputData
-from fedot.core.operations.operation import Operation
-from fedot.core.repository.dataset_types import DataTypesEnum
-from fedot.core.repository.operation_types_repository import OperationTypesRepository
-from fedot.core.repository.tasks import TaskTypesEnum
-from fedot.preprocessing.data_types import TYPE_TO_ID
-
-
-class Model(Operation):
-    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
-
-    Args:
-        operation_type: name of the model
-    """
-
-    def __init__(self, operation_type: str):
-        super().__init__(operation_type=operation_type)
-        self.operations_repo = OperationTypesRepository('model')
-
-    @staticmethod
-    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
-        """Assign types for tabular data obtained from model predictions.\n
-        By default, all types of model predictions for tabular data can be clearly defined
-        """
-        if output_data.data_type is not DataTypesEnum.table:
-            # No column data types info for non-tabular data
-            return output_data
-
-        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
-        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
-
-        predict_shape = np.array(output_data.predict).shape
-        # Add information about features
-        if is_regression_task or is_ts_forecasting_task:
-            if len(predict_shape) < 2:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
-            else:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
-        else:
-            if len(predict_shape) < 2:
-                output_data.predict = output_data.predict.reshape((-1, 1))
-                predict_shape = output_data.predict.shape
-            # Classification task or clustering
-            if output_mode == 'labels':
-                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
-            else:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
-
-        # Add information about target
-        target_shape = output_data.target.shape if output_data.target is not None else None
-        if target_shape is None:
-            # There is no target column in output data
-            output_data.supplementary_data.column_types = column_info
-            return output_data
-
-        if is_regression_task or is_ts_forecasting_task:
-            if len(target_shape) > 1:
-                column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
-            else:
-                # Array present "time series"
-                column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
-        else:
-            # Classification task or clustering
-            column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
-
-        output_data.supplementary_data.column_types = column_info
-        return output_data
+import numpy as np
+
+from fedot.core.data.data import OutputData
+from fedot.core.operations.operation import Operation
+from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.core.repository.operation_types_repository import OperationTypesRepository
+from fedot.core.repository.tasks import TaskTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID
+
+
+class Model(Operation):
+    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
+
+    Args:
+        operation_type: name of the model
+    """
+
+    def __init__(self, operation_type: str):
+        super().__init__(operation_type=operation_type)
+        self.operations_repo = OperationTypesRepository('model')
+
+    @staticmethod
+    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
+        """Assign types for tabular data obtained from model predictions.\n
+        By default, all types of model predictions for tabular data can be clearly defined
+        """
+        if output_data.data_type is not DataTypesEnum.table:
+            # No column data types info for non-tabular data
+            return output_data
+
+        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
+        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
+
+        predict_shape = np.array(output_data.predict).shape
+        # Add information about features
+        if is_regression_task or is_ts_forecasting_task:
+            if len(predict_shape) < 2:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+        else:
+            if len(predict_shape) < 2:
+                output_data.predict = output_data.predict.reshape((-1, 1))
+                predict_shape = output_data.predict.shape
+            # Classification task or clustering
+            if output_mode == 'labels':
+                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+
+        # Add information about target
+        target_shape = output_data.target.shape if output_data.target is not None else None
+        if target_shape is None:
+            # There is no target column in output data
+            output_data.supplementary_data.column_types = column_info
+            return output_data
+
+        if is_regression_task or is_ts_forecasting_task:
+            if len(target_shape) > 1:
+                column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
+            else:
+                # Array present "time series"
+                column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
+        else:
+            # Classification task or clustering
+            column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
+
+        output_data.supplementary_data.column_types = column_info
+        return output_data
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index e233fa88a5..cdf6ecf708 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -425,7 +425,7 @@ def define_column_types(table: np.ndarray):
     nans = pd.isna(table)
     table_of_types = np.empty_like(table, dtype=np.int8)
     table_of_types[~nans] = [
-        TYPE_TO_ID[type(x.item() if getattr(x, 'item', False) else x)]
+        TYPE_TO_ID[type(x.item() if isinstance(x, (np.ndarray, np.generic)) else x)]
        for x in table[~nans]
     ]
     table_of_types[nans] = TYPE_TO_ID[type(None)]
@@ -447,7 +447,7 @@ def define_column_types(table: np.ndarray):

         # Store information about nans in the target
-        nan_ids = np.ravel(np.argwhere(col_types == TYPE_TO_ID[type(None)]))
+        nan_ids = np.where(nans[:, column_id])[0]
         columns_info.update({column_id: {'types': unique_col_types,
                                          'str_number': str_number,
                                          'int_number': int_number,
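
The isinstance fix above matters because numpy scalars are not keys of TYPE_TO_ID until they are unwrapped to plain Python values. A small numpy-only demo of the behavior being relied on (illustrative, not from the patch):

import numpy as np

x = np.int64(5)
type(x)          # <class 'numpy.int64'> - not a TYPE_TO_ID key
type(x.item())   # <class 'int'> - .item() unwraps to a Python scalar

# The previous getattr(x, 'item', False) check was also truthy for numpy
# scalars, but isinstance(x, (np.ndarray, np.generic)) states the intent
# directly and avoids matching arbitrary objects that happen to define item().
assert isinstance(x, np.generic)
assert not isinstance(5, np.generic)
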
From 466a3eaf7b790abe68190100fa1747ab4ecc81d4 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Thu, 9 Feb 2023 15:01:22 +0300
Subject: [PATCH 17/72] style fixes

---
 fedot/core/data/data_preprocessing.py          |  3 +--
 .../data_operations/categorical_encoders.py    |  6 +++---
 fedot/core/operations/model.py                 |  6 ++----
 fedot/preprocessing/data_types.py              | 18 ++++++++----------
 test/unit/preprocessing/test_preprocessors.py  |  6 +++---
 5 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index 9f4455d87b..92960e5283 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -30,8 +30,7 @@ def convert_into_column(array: np.ndarray) -> np.ndarray:
     """ Perform conversion for data if it is necessary """
     if len(array.shape) == 1:
         return array.reshape(-1, 1)
-    else:
-        return array
+    return array


 def divide_data_categorical_numerical(input_data: InputData, categorical_ids: list,
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 58ee267622..82c3767dad 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -36,9 +36,9 @@ def fit(self, input_data: InputData):
         :return encoder: trained encoder (optional output)
         """
         features = input_data.features
-        features_type_ids = input_data.supplementary_data.column_types.get('features')
+        features_types_ids = input_data.supplementary_data.column_types.get('features')
         categorical_ids, non_categorical_ids = find_categorical_columns(features,
-                                                                        features_type_ids)
+                                                                        features_types_ids)

         # Indices of columns with categorical and non-categorical features
         self.categorical_ids = categorical_ids
@@ -84,7 +84,7 @@ def _update_column_types(self, output_data: OutputData):
             # Calculate new binary columns number after encoding
             encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns)
-            numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number)
+            numerical_columns += [TYPE_TO_ID[int]] * encoded_columns_number
             output_data.encoded_idx = self.encoded_ids
             output_data.supplementary_data.column_types['features'] = numerical_columns
diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py
index 1499c05a47..9e38f5e27c 100644
--- a/fedot/core/operations/model.py
+++ b/fedot/core/operations/model.py
@@ -43,10 +43,8 @@ def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> Ou
                 output_data.predict = output_data.predict.reshape((-1, 1))
                 predict_shape = output_data.predict.shape
             # Classification task or clustering
-            if output_mode == 'labels':
-                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
-            else:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+            target_type = int if output_mode == 'labels' else float
+            column_info = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]}

         # Add information about target
         target_shape = output_data.target.shape if output_data.target is not None else None
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index cdf6ecf708..1082ea43e9 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from copy import copy
+from typing import TYPE_CHECKING, Tuple

 import numpy as np
 import pandas as pd
@@ -8,13 +9,10 @@
 from fedot.core.repository.tasks import Task, TaskTypesEnum

-NoneType = type(None)
-from typing import TYPE_CHECKING, Tuple
-
 if TYPE_CHECKING:
     from fedot.core.data.data import InputData

-_convertable_types = (bool, float, int, str, type(None))
+_convertable_types = (bool, float, int, str, type(None))  # preserve lexicographical order
 _types_ids = range(len(_convertable_types))

 TYPE_TO_ID = dict(zip(_convertable_types, _types_ids))
@@ -543,17 +541,17 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict)
     """
     updated_column_types = []
     for column_id, column_info in columns_types_info.items():
-        column_type_ids = column_info['types']
+        column_types_ids = column_info['types']

-        if len(column_type_ids) == 1:
+        if len(column_types_ids) == 1:
             # Column initially contain only one type
-            updated_column_types.append(column_type_ids[0])
-        elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids:
+            updated_column_types.append(column_types_ids[0])
+        elif len(column_types_ids) == 2 and TYPE_TO_ID[type(None)] in column_types_ids:
             # Column with one type and nans
-            filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]]
+            filtered_types = [x for x in column_types_ids if x != TYPE_TO_ID[type(None)]]
             updated_column_types.append(filtered_types[0])
         else:
-            if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_type_ids):
+            if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_types_ids):
                 # Mixed-types column with string
                 new_column_type = converted_columns[column_id]
                 if new_column_type != -1:
diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py
index 81fa8bb74b..562e821a3e 100644
--- a/test/unit/preprocessing/test_preprocessors.py
+++ b/test/unit/preprocessing/test_preprocessors.py
@@ -156,10 +156,10 @@ def test_column_types_process_correctly():
     pipeline.fit(train_data)
     predicted = pipeline.predict(test_data)

-    features_columns = predicted.supplementary_data.column_types['features']
-    assert len(features_columns) == predicted.predict.shape[1]
+    features_types_ids = predicted.supplementary_data.column_types['features']
+    assert len(features_types_ids) == predicted.predict.shape[1]
     # All output values are float
-    assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_columns)
+    assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_types_ids)
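
One detail behind the .extend -> += change above: augmented assignment on a list mutates it in place, so any alias (here, the list held inside supplementary_data) sees the update, while plain concatenation rebinds the name. A two-line illustration (plain Python, illustrative values):

col_types = [1, 1]
alias = col_types            # e.g. the list stored in supplementary_data
col_types += [1] * 2         # in-place, same object - alias sees the change
assert alias == [1, 1, 1, 1]

col_types = col_types + [1]  # rebinding instead would NOT update alias
assert alias == [1, 1, 1, 1]
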
From 39785144636057cb6dbd3dc29eefc5290c12b2e4 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Thu, 9 Feb 2023 15:03:29 +0300
Subject: [PATCH 18/72] array creation via multiplication fix

---
 .../data_operations/sklearn_transformations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
index 3b7e4b49b3..47f77ba9e0 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
@@ -89,7 +89,7 @@ def update_column_types(output_data: OutputData) -> OutputData:
         """
         _, n_cols = output_data.predict.shape
-        output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float] * n_cols]
+        output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float]] * n_cols
         return output_data

@@ -197,7 +197,7 @@ def _update_column_types(self, source_features_shape, output_data: OutputData):
         if cols_number_added > 0:
             # There are new columns in the table
             col_types = output_data.supplementary_data.column_types['features']
-            col_types.extend([TYPE_TO_ID[float]] * cols_number_added)
+            col_types += [TYPE_TO_ID[float]] * cols_number_added
             output_data.supplementary_data.column_types['features'] = col_types
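
The bracket placement in the first hunk is the entire bug: multiplying inside the brackets scales the ID itself, multiplying outside repeats it. A quick illustration with stand-in values:

n_cols = 3
type_id = 1           # stand-in for TYPE_TO_ID[float]

[type_id * n_cols]    # [3]      - a single element: the id times n_cols
[type_id] * n_cols    # [1, 1, 1] - n_cols copies of the id, as intended
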
From f5e1589b2974409e4a544aaf565bb0e47166bd97 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 15 Feb 2023 13:30:56 +0300
Subject: [PATCH 19/72] unified unimodal methods

---
 fedot/preprocessing/base_preprocessing.py |   4 +-
 fedot/preprocessing/preprocessing.py      | 105 +++++++---------------
 2 files changed, 31 insertions(+), 78 deletions(-)

diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py
index 106e263501..a3244559c7 100644
--- a/fedot/preprocessing/base_preprocessing.py
+++ b/fedot/preprocessing/base_preprocessing.py
@@ -33,10 +33,8 @@ def __init__(self):
         self.features_encoders: Dict[str, Union[OneHotEncodingImplementation, LabelEncodingImplementation]] = {}
         self.use_label_encoder: bool = False
         self.features_imputers: Dict[str, ImputationImplementation] = {}
-        self.ids_relevant_features: Dict[str, List[int]] = {}
+        self.ids_relevant_features: Dict[str, np.ndarray] = {}

-        # Cannot be processed due to incorrect types or large number of nans
-        self.ids_incorrect_features: Dict[str, List[int]] = {}
         # Categorical preprocessor for binary categorical features
         self.binary_categorical_processors: Dict[str, BinaryCategoricalPreprocessor] = {}
         self.types_correctors: Dict[str, TableTypesCorrector] = {}
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index bcbe969a0f..04bcf106a5 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -107,13 +107,12 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) ->
         self._init_supplementary_preprocessors(data)

         if isinstance(data, InputData):
-            data = self._prepare_obligatory_unimodal_for_fit(data, source_name=DEFAULT_SOURCE_NAME)
+            data = self._prepare_obligatory_unimodal(data, source_name=DEFAULT_SOURCE_NAME)

         elif isinstance(data, MultiModalData):
             self._init_main_target_source_name(data)
             for data_source_name, values in data.items():
-                data[data_source_name] = self._prepare_obligatory_unimodal_for_fit(values,
-                                                                                   source_name=data_source_name)
+                data[data_source_name] = self._prepare_obligatory_unimodal(values, source_name=data_source_name)

         BasePreprocessor.mark_as_preprocessed(data)
         return data
@@ -122,12 +121,12 @@ def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData])
         if isinstance(data, InputData):
-            data = self._prepare_obligatory_unimodal_for_predict(data, source_name=DEFAULT_SOURCE_NAME)
+            data = self._prepare_obligatory_unimodal(data, source_name=DEFAULT_SOURCE_NAME, is_fit_stage=False)

         elif isinstance(data, MultiModalData):
             for data_source_name, values in data.items():
-                data[data_source_name] = self._prepare_obligatory_unimodal_for_predict(values,
-                                                                                       source_name=data_source_name)
+                data[data_source_name] = self._prepare_obligatory_unimodal(values, source_name=data_source_name,
+                                                                           is_fit_stage=False)

         BasePreprocessor.mark_as_preprocessed(data)
         return data
@@ -170,13 +169,14 @@ def _take_only_correct_features(self, data: InputData, source_name: str):
             source_name: name of the data source node
         """
         current_relevant_ids = self.ids_relevant_features[source_name]
-        if current_relevant_ids:
+        if len(current_relevant_ids):
             data.features = data.features[:, current_relevant_ids]

     @exclude_ts
     @exclude_multi_ts
     @exclude_image
-    def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str) -> InputData:
+    def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
+                                     *, is_fit_stage: bool = True) -> InputData:
         """
         Processes InputData for pipeline fit method
@@ -204,19 +204,22 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
         replace_inf_with_nans(data)

         # Find incorrect features which must be removed
-        self._find_features_full_of_nans(data, source_name)
+        if is_fit_stage:
+            self._find_features_lacking_nans(data, source_name)
         self._take_only_correct_features(data, source_name)
-        data = self._drop_rows_with_nan_in_target(data)
-
-        # Column types processing - launch after correct features selection
-        self.types_correctors[source_name].convert_data_for_fit(data)
-        if self.types_correctors[source_name].target_converting_has_errors:
+        if is_fit_stage:
             data = self._drop_rows_with_nan_in_target(data)
-        # Train Label Encoder for categorical target if necessary and apply it
-        if source_name not in self.target_encoders:
-            self._train_target_encoder(data, source_name)
-        data.target = self._apply_target_encoding(data, source_name)
+            # Column types processing - launch after correct features selection
+            self.types_correctors[source_name].convert_data_for_fit(data)
+            if self.types_correctors[source_name].target_converting_has_errors:
+                data = self._drop_rows_with_nan_in_target(data)
+            # Train Label Encoder for categorical target if necessary and apply it
+            if source_name not in self.target_encoders:
+                self._train_target_encoder(data, source_name)
+            data.target = self._apply_target_encoding(data, source_name)
+        else:
+            self.types_correctors[source_name].convert_data_for_predict(data)

         # TODO andreygetmanov target encoding must be obligatory for all data types
         if data_type_is_text(data):
@@ -225,49 +228,10 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
         elif data_type_is_table(data):
             data = self._clean_extra_spaces(data)
             # Process binary categorical features
-            data = self.binary_categorical_processors[source_name].fit_transform(data)
-
-        return data
-
-    @exclude_ts
-    @exclude_multi_ts
-    @exclude_image
-    def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name: str) -> InputData:
-        """
-        Processes InputData for pipeline predict method
-
-        Args:
-            data: to be preprocessed
-            source_name: name of the data source node
-
-        Returns:
-            obligatory-prepared data
-        """
-        if data.supplementary_data.obligatorily_preprocessed:
-            # Preprocessing was already done
-            return data
-        return data
-
-        # Convert datetime data to numerical
-        data.features = np_datetime_to_numeric(data.features)
-        if data.target is not None:
-            data.target = np_datetime_to_numeric(data.target)
-
-        # Wrap indices in numpy array
-        data.idx = np.array(data.idx)
-
-        # Fix tables / time series sizes
-        data = self._correct_shapes(data)
-        replace_inf_with_nans(data)
-
-        # Perform preprocessing for types - launch after correct features selection
-        self._take_only_correct_features(data, source_name)
-        self.types_correctors[source_name].convert_data_for_predict(data)
-
-        if data_type_is_text(data):
-            replace_nans_with_empty_strings(data)
-        if data_type_is_table(data):
-            data = self._clean_extra_spaces(data)
-            data = self.binary_categorical_processors[source_name].transform(data)
+            if is_fit_stage:
+                data = self.binary_categorical_processors[source_name].fit_transform(data)
+            else:
+                data = self.binary_categorical_processors[source_name].transform(data)

         return data
@@ -294,27 +258,18 @@ def _prepare_optional(self, pipeline, data: InputData, source_name: str):
             if not has_tag:
                 data = action_if_no_tag(data, source_name)

-    def _find_features_full_of_nans(self, data: InputData, source_name: str):
+    def _find_features_lacking_nans(self, data: InputData, source_name: str):
         """
-        Finds features with more than ALLOWED_NAN_PERCENT of nan's
+        Finds features with less than ALLOWED_NAN_PERCENT of nan's

         Args:
             data: data to find columns with nan values
             source_name: name of the data source node
         """
-        # Initialize empty lists to fill it with indices
-        self.ids_relevant_features[source_name] = []
-        self.ids_incorrect_features[source_name] = []
-
         features = data.features
-        n_samples, n_columns = features.shape
-
-        for i in range(n_columns):
-            feature = features[:, i]
-            if np.sum(pd.isna(feature)) / n_samples < ALLOWED_NAN_PERCENT:
-                self.ids_relevant_features[source_name].append(i)
-            else:
-                self.ids_incorrect_features[source_name].append(i)
+        axes_except_cols = (0,) + tuple(range(2, features.ndim))
+        are_allowed = np.mean(pd.isna(features), axis=axes_except_cols) < ALLOWED_NAN_PERCENT
+        self.ids_relevant_features[source_name] = np.nonzero(are_allowed)[0]

     @staticmethod
     def _drop_rows_with_nan_in_target(data: InputData) -> InputData:
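
The rewritten nan filter above computes all column nan-ratios in one vectorized pass. A standalone sketch of the same pattern, with 0.9 used as a stand-in for the module's ALLOWED_NAN_PERCENT constant (its real value is defined elsewhere in preprocessing.py):

import numpy as np
import pandas as pd

ALLOWED_NAN_PERCENT = 0.9  # stand-in value for the demo
features = np.array([[1, np.nan, np.nan],
                     [2, np.nan, 5.0],
                     [3, np.nan, 6.0]], dtype=object)

# For a 2-D table this reduces over axis 0 only; extra axes are folded in
# so the result is always one ratio per column
axes_except_cols = (0,) + tuple(range(2, features.ndim))
nan_ratio = np.mean(pd.isna(features), axis=axes_except_cols)
# nan_ratio -> [0.0, 1.0, 0.333...]
relevant_ids = np.nonzero(nan_ratio < ALLOWED_NAN_PERCENT)[0]
# relevant_ids -> [0, 2]; the all-nan column 1 is dropped
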
And if it is necessary, found acceptable strategy for operation

From d6dd5a9c898ed70cbf5b5afb1658457781a1eb4a Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 15 Feb 2023 13:47:24 +0300
Subject: [PATCH 21/72] add safer version of enum/strategies imports

---
 fedot/core/repository/json_evaluation.py      | 79 ++++++++++++-------
 .../repository/operation_types_repository.py  | 46 ++++++-----
 test/integration/models/test_repository.py   |  8 +-
 3 files changed, 79 insertions(+), 54 deletions(-)

diff --git a/fedot/core/repository/json_evaluation.py b/fedot/core/repository/json_evaluation.py
index 82948c3849..3ab0a96e93 100644
--- a/fedot/core/repository/json_evaluation.py
+++ b/fedot/core/repository/json_evaluation.py
@@ -1,37 +1,58 @@
-from typing import Union
+from importlib import import_module
+from typing import Union, TYPE_CHECKING, List

 # imports are required for the eval
-from fedot.core.repository.dataset_types import *
-from fedot.core.repository.tasks import *
+from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.core.repository.tasks import TaskTypesEnum
+
+if TYPE_CHECKING:
+    from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy


 def read_field(source: dict, field_name: str, default: list):
-    """ Function for reading field in the dictionary
+    """
+    Function for reading field in the dictionary
+
+    Args:
+        source: dictionary with information
+        field_name: name of the looked up field in the ``source``
+        default: default list if ``field_name`` is not in the source dict keys
+
+    Returns:
+        list with field values
+    """
+    field_value = source.get(field_name, default)
+    if isinstance(field_value, str):
+        return import_enums_from_str(field_value)
+    return field_value
+
+
+def import_enums_from_str(field_value: str) -> Union[List[DataTypesEnum],
+                                                     List[TaskTypesEnum]]:
+    """
+    Imports enums by theirs string name representation and returns list of theirs values
+
+    Args:
+        field_value: str representing list of
+            either class:`DataTypesEnum` or class:`TaskTypesEnum` values
+
+    Returns:
+        list of either class:`DataTypesEnum` or class:`TaskTypesEnum` values
+    """
+    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val != '']
+    return [
+        getattr(globals()[data_type], value)
+        for (data_type, value) in enums]
+
+
+def import_strategy_from_str(field_value: List[str]) -> 'EvaluationStrategy':
+    """
+    Imports evaluation strategy module and returns its particular type

-    :param source: dictionary with information
-    :param field_name: name of the field for searching for in it
-    :param default: default list if field_name is not in the source dict keys
+    Args:
+        field_value: list of [namespace, type_name]

-    :return : list with field values
+    Returns:
+        specific evaluation strategy
     """
-    if field_name in source.keys():
-        field_value = source[field_name]
-        if isinstance(field_value, str):
-            return eval_field_str(field_value)
-        else:
-            return field_value
-    else:
-        return default
-
-
-def eval_field_str(field_value) -> Union[List[DataTypesEnum],
-                                         List[TaskTypesEnum]]:
-    # TODO add docstring
-    return eval(field_value)
-
-
-def eval_strategy_str(field_value):
-    # TODO add docstring
-    namespace = field_value[0]
-    exec(f'from {namespace} import {field_value[1]}')
-    return eval(field_value[1])
+    return getattr(import_module(field_value[0]), field_value[1])
diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py
index 2228c32563..d300f10f15 100644
---
 a/fedot/core/repository/operation_types_repository.py
+++ b/fedot/core/repository/operation_types_repository.py
@@ -2,7 +2,7 @@
 import os
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional, Union, TYPE_CHECKING

 import numpy as np
 from golem.core.log import default_log
@@ -10,9 +10,12 @@
 from fedot.core.constants import BEST_QUALITY_PRESET_NAME, AUTO_PRESET_NAME
 from fedot.core.repository.dataset_types import DataTypesEnum
-from fedot.core.repository.json_evaluation import eval_field_str, eval_strategy_str, read_field
+from fedot.core.repository.json_evaluation import import_enums_from_str, import_strategy_from_str, read_field
 from fedot.core.repository.tasks import Task, TaskTypesEnum

+if TYPE_CHECKING:
+    from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy
+
 AVAILABLE_REPO_NAMES = ['all', 'model', 'data_operation', 'automl']


@@ -22,14 +25,14 @@ class OperationMetaInfo:
     input_types: List[DataTypesEnum]
     output_types: List[DataTypesEnum]
     task_type: List[TaskTypesEnum]
-    supported_strategies: Any
+    supported_strategies: Union['EvaluationStrategy', Dict[str, 'EvaluationStrategy']]
     allowed_positions: List[str]
     tags: Optional[List[str]] = None
     presets: Optional[List[str]] = None

-    def current_strategy(self, task: TaskTypesEnum):
-        """Method allows getting available processing strategies depending on the
-        selected task
+    def current_strategy(self, task: TaskTypesEnum) -> Optional['EvaluationStrategy']:
+        """
+        Gets available processing strategies depending on the selected task

         Args:
             task: machine learning task (e.g. regression and classification)
@@ -176,13 +179,9 @@ def _initialise_repo(cls, repo_path: str) -> List[OperationMetaInfo]:
             properties = operations_json.get(current_operation_key)
             metadata = metadata_json[properties['meta']]

-            task_types = eval_field_str(metadata['tasks'])
-            input_type = eval_field_str(properties['input_type']) \
-                if ('input_type' in properties) \
-                else eval_field_str(metadata['input_type'])
-            output_type = eval_field_str(properties['output_type']) \
-                if ('output_type' in properties) \
-                else eval_field_str(metadata['output_type'])
+            task_types = import_enums_from_str(metadata['tasks'])
+            input_type = import_enums_from_str(properties.get('input_type', metadata.get('input_type')))
+            output_type = import_enums_from_str(properties.get('output_type', metadata.get('output_type')))

             # Get available strategies for obtained metadata
             supported_strategies = OperationTypesRepository.get_strategies_by_metadata(metadata)
@@ -219,24 +218,29 @@ def _initialise_repo(cls, repo_path: str) -> List[OperationMetaInfo]:
         return operations_list

     @staticmethod
-    def get_strategies_by_metadata(metadata: dict):
-        """Method allow obtain strategy instance by the metadata
+    def get_strategies_by_metadata(metadata: dict) -> Union['EvaluationStrategy', Dict[str, 'EvaluationStrategy']]:
+        """
+        Obtains strategy instance by the metadata

         Args:
             metadata: information about meta of the operation
-            supported_strategies: available strategies for current metadata
+
+        Returns:
+            available strategies for current metadata
         """
         strategies_json = metadata['strategies']
         if isinstance(strategies_json, list):
-            supported_strategies = eval_strategy_str(strategies_json)
-        else:
+            supported_strategies = import_strategy_from_str(strategies_json)
+        elif isinstance(strategies_json, dict):
             supported_strategies = {}
-            for strategy_dict_key in strategies_json.keys():
+            for strategy_dct_key, strategy_str_value in strategies_json.items():
                 # Convert string into class path for import
-                import_path = eval_field_str(strategy_dict_key)
-                strategy_class = eval_strategy_str(strategies_json[strategy_dict_key])
+                import_path = import_enums_from_str(strategy_dct_key)
+                strategy_class = import_strategy_from_str(strategy_str_value)
                 supported_strategies.update({import_path: strategy_class})
+        else:
+            raise TypeError('strategies are of unknown type')
         return supported_strategies

     def operation_info_by_id(self, operation_id: str) -> Optional[OperationMetaInfo]:
diff --git a/test/integration/models/test_repository.py b/test/integration/models/test_repository.py
index 418134e23f..39eee3afd1 100644
--- a/test/integration/models/test_repository.py
+++ b/test/integration/models/test_repository.py
@@ -2,8 +2,8 @@
 import os

 from fedot.core.operations.evaluation.classification import SkLearnClassificationStrategy
-from fedot.core.repository.json_evaluation import eval_field_str, \
-    eval_strategy_str, read_field
+from fedot.core.repository.json_evaluation import import_enums_from_str, \
+    import_strategy_from_str, read_field
 from fedot.core.repository.operation_types_repository import (OperationTypesRepository,
                                                               get_operation_type_from_id)
 from fedot.core.repository.pipeline_operation_repository import PipelineOperationRepository
@@ -48,7 +48,7 @@ def test_search_in_repository_by_tag_correct():

 def test_eval_field_str():
     model_metadata = _model_metadata_example(mocked_path())
-    task_types = eval_field_str(model_metadata['tasks'])
+    task_types = import_enums_from_str(model_metadata['tasks'])
     assert len(task_types) == 1
     assert task_types[0] == TaskTypesEnum.classification

@@ -59,7 +59,7 @@ def test_eval_strategy_str():

     strategies_json = model_metadata['strategies']

-    strategy = eval_strategy_str(strategies_json)
+    strategy = import_strategy_from_str(strategies_json)
     assert strategy is SkLearnClassificationStrategy
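
The core of this patch is swapping eval/exec for importlib plus getattr, which resolves the same dotted path without executing arbitrary strings. A standalone sketch of the lookup pattern, using a stdlib class as a stand-in since a real strategy path would require the full fedot package:

from importlib import import_module

# ['module.path', 'ClassName'] pair, as stored in the repository JSON
field_value = ['collections', 'OrderedDict']  # stand-in values
cls = getattr(import_module(field_value[0]), field_value[1])
assert cls.__name__ == 'OrderedDict'
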
From b6ecf9a055650a94f7205b94f9bfe3c67254b15d Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Fri, 17 Feb 2023 14:29:49 +0300
Subject: [PATCH 22/72] optimizations and style fixes

---
 fedot/core/data/data_preprocessing.py        |  6 +--
 .../data_operations/categorical_encoders.py  | 14 +++---
 fedot/core/repository/json_evaluation.py     |  4 +-
 fedot/preprocessing/categorical.py           | 50 ++++++-------------
 fedot/preprocessing/data_types.py            | 10 ++--
 5 files changed, 31 insertions(+), 53 deletions(-)

diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index 92960e5283..10108e39bd 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool:

 def replace_inf_with_nans(input_data: InputData):
     features = input_data.features
-    has_infs = (features == np.inf) | (features == -np.inf)
-    if np.any(has_infs):
-        features[has_infs] = np.nan
+    inf_idxs: Tuple[np.ndarray, ...] = ((features == np.inf) | (features == -np.inf)).nonzero()
+    if len(inf_idxs[0]):
+        features[inf_idxs] = np.nan


 def replace_nans_with_empty_strings(input_data: InputData):
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 82c3767dad..27607c00f2 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import Optional
+from typing import Optional, Tuple

 import numpy as np
 import pandas as pd
@@ -143,9 +143,9 @@ def transform(self, input_data: InputData) -> OutputData:
         # If categorical features are exists - transform them inplace in InputData
         for categorical_id in self.categorical_ids:
             categorical_column = input_data.features[:, categorical_id]
-            has_nan: np.ndarray = pd.isna(categorical_column)
+            nan_idxs: Tuple[np.ndarray, ...] = pd.isna(categorical_column).nonzero()

-            transformed = self._apply_label_encoder(categorical_column, categorical_id, has_nan)
+            transformed = self._apply_label_encoder(categorical_column, categorical_id, nan_idxs)
             copied_data.features[:, categorical_id] = transformed

         output_data = self._convert_to_output(copied_data,
@@ -174,21 +174,21 @@ def _fit_label_encoders(self, input_data: InputData):
             self.encoders.update({categorical_id: le})

     def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int,
-                             has_nan: np.ndarray) -> np.ndarray:
+                             nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray:
         """ Apply fitted LabelEncoder for column transformation

         :param categorical_column: numpy array with categorical features
         :param categorical_id: index of current categorical column
-        :param has_nan: bool array of gap elements in the ``categorical_column``
+        :param nan_idxs: indices of gap elements in the ``categorical_column``
         """
         column_encoder = self.encoders[categorical_id]
         column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column)))

         transformed_column = column_encoder.transform(categorical_column)
-        if len(has_nan) > 0:
+        if len(nan_idxs[0]):
             # Store np.nan values
             transformed_column = transformed_column.astype(object)
-            transformed_column[has_nan] = np.nan
+            transformed_column[nan_idxs] = np.nan

         return transformed_column
diff --git a/fedot/core/repository/json_evaluation.py b/fedot/core/repository/json_evaluation.py
index 3ab0a96e93..ba4483ce0e 100644
--- a/fedot/core/repository/json_evaluation.py
+++ b/fedot/core/repository/json_evaluation.py
@@ -1,7 +1,7 @@
 from importlib import import_module
 from typing import Union, TYPE_CHECKING, List

-# imports are required for the eval
+# imports are required beneath in the function
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import TaskTypesEnum

@@ -39,7 +39,7 @@ def import_enums_from_str(field_value: str) -> Union[List[DataTypesEnum],
     Returns:
         list of either class:`DataTypesEnum` or class:`TaskTypesEnum` values
     """
-    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val != '']
+    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val]
     return [
         getattr(globals()[data_type], value)
         for (data_type, value) in enums]
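
The recurring pattern in this patch, boolean mask -> .nonzero() index tuple, behaves as follows (toy example, numpy and pandas only):

import numpy as np
import pandas as pd

column = np.array(['a', np.nan, 'b', np.nan], dtype=object)
nan_idxs = pd.isna(column).nonzero()   # tuple of index arrays: (array([1, 3]),)

# len(nan_idxs[0]) gives the nan count; fancy-indexing with the tuple
# writes back only into the gap positions
out = np.arange(4, dtype=object)
out[nan_idxs] = np.nan
# out -> [0, nan, 2, nan]

Keeping the indices instead of the full boolean mask means the later write-backs touch only the gap rows rather than re-scanning the whole column.
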
b/fedot/preprocessing/categorical.py index 09368286c5..fe5fa1eeaf 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -33,26 +33,23 @@ def fit(self, input_data: InputData): return self binary_ids_to_convert = [] - number_of_columns = input_data.features.shape[-1] - for column_id in range(number_of_columns): - pd_column = pd.Series(input_data.features[:, column_id], copy=True) - has_nan = pd_column.isna() - if has_nan.sum() and column_id in categorical_ids: + for column_id, column in enumerate(input_data.features.T): + pd_column = pd.Series(column, copy=True) + is_nan = pd_column.isna() + column_uniques = pd_column.unique() + if is_nan.sum() and column_id in categorical_ids: # This categorical column has nans - replaced_column, _ = replace_nans_with_fedot_nans(pd_column, has_nan) - column_uniques = replaced_column.unique() + pd_column[is_nan] = FEDOT_STR_NAN if len(column_uniques) <= 3: # There is column with binary categories and gaps self.binary_features_with_nans.append(column_id) binary_ids_to_convert.append(column_id) - self._train_encoder(replaced_column, column_id) + self._train_encoder(pd_column, column_id) else: - column_uniques = pd_column.unique() if len(column_uniques) <= 2 and column_id in categorical_ids: # Column contains binary string feature binary_ids_to_convert.append(column_id) - # Train encoder for current column self._train_encoder(pd_column, column_id) @@ -67,26 +64,15 @@ def transform(self, input_data: InputData) -> InputData: # There are no binary categorical features return input_data - converted_features = [] - number_of_columns = input_data.features.shape[-1] - for column_id in range(number_of_columns): + copied_data = deepcopy(input_data) + for column_id, column in enumerate(copied_data.features.T): if column_id in self.binary_ids_to_convert: # If column contains nans - replace them with fedot nans special string - pd_column = pd.Series(input_data.features[:, column_id]) - has_nan = pd_column.isna() - replaced_column, has_nan = replace_nans_with_fedot_nans(pd_column, has_nan) + nan_idxs: Tuple[np.ndarray, ...] 
= pd.isna(column).nonzero() + column[nan_idxs] = FEDOT_STR_NAN # Convert into integers - converted_column = self._apply_encoder(replaced_column, column_id, has_nan) - else: - # Stay column the same - converted_column = input_data.features[:, column_id] - - converted_features.append(converted_column.reshape((-1, 1))) - - # Store transformed features - copied_data = deepcopy(input_data) - copied_data.features = np.hstack(converted_features) + column[:] = self._apply_encoder(column, column_id, nan_idxs) # Update features types features_types = copied_data.supplementary_data.column_types['features'] @@ -117,22 +103,16 @@ def _train_encoder(self, column: pd.Series, column_id: int): # Store fitted label encoder for transform method self.binary_encoders.update({column_id: encoder}) - def _apply_encoder(self, column: pd.Series, column_id: int, has_nan: pd.Series) -> np.ndarray: + def _apply_encoder(self, column: np.ndarray, column_id: int, nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray: """ Apply already fitted encoders """ encoder = self.binary_encoders[column_id] # Extend encoder classes if the column contains categories not previously encountered encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) - if len(has_nan) > 0: + if len(nan_idxs[0]): # Column has nans in its structure - after conversion replace it converted = converted.astype(float) - converted[has_nan] = np.nan + converted[nan_idxs] = np.nan return converted - - -def replace_nans_with_fedot_nans(column: pd.Series, has_nan: pd.Series) -> Tuple[pd.Series, pd.Series]: - # Add new category - 'fedot_nan' after converting it will be replaced by nans - column[has_nan] = FEDOT_STR_NAN - return column, has_nan diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 1082ea43e9..0298d456e2 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -217,8 +217,8 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) data.supplementary_data.column_types['features'] = [ - col_type - for col_id, col_type in enumerate(data.supplementary_data.column_types['features']) + col_type_id + for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) if col_id not in self.string_columns_transformation_failed ] @@ -429,9 +429,7 @@ def define_column_types(table: np.ndarray): table_of_types[nans] = TYPE_TO_ID[type(None)] columns_info = {} - for column_id in range(n_columns): - col_types = table_of_types[:, column_id] - + for column_id, col_types in enumerate(table_of_types.T): unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True) if len(unique_col_types) > 1: @@ -445,7 +443,7 @@ def define_column_types(table: np.ndarray): ] # Store information about nans in the target - nan_ids = np.where(nans[:, column_id])[0] + nan_ids = np.nonzero(nans[:, column_id])[0] columns_info.update({column_id: {'types': unique_col_types, 'str_number': str_number, 'int_number': int_number, From 08712b3d5c22e4a0d8614cda9648566d86a9d344 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 21 Feb 2023 20:01:27 +0300 Subject: [PATCH 23/72] set psutil req with the one from golem --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c23f18a05d..4e2e072802 100644 --- a/requirements.txt +++ b/requirements.txt @@ 
-32,7 +32,7 @@ joblib>=0.17.0 requests>=2.0 tqdm typing>=3.7.0 -psutil>=5.7.3 +psutil>=5.9.2 # Tests pytest>=6.2.0 From 571157cb2f90c5c29538af5a003abc38287f5d7d Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 28 Feb 2023 11:19:31 +0300 Subject: [PATCH 24/72] bug fix --- fedot/core/data/data_preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 10108e39bd..c8de456492 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - inf_idxs: Tuple[np.ndarray, ...] = ((features == np.inf) | (features == -np.inf)).nonzero() - if len(inf_idxs[0]): - features[inf_idxs] = np.nan + has_infs = ((features == np.inf) | (features == -np.inf)) + if np.any(has_infs): + features[has_infs] = np.nan def replace_nans_with_empty_strings(input_data: InputData): From b2e5f82235f95e5bfdb75a84f6df8a9cf4048883 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 28 Feb 2023 12:31:37 +0300 Subject: [PATCH 25/72] nan to num optimization --- .../models/discriminant_analysis.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py index 9a61c16eca..317e3d41a4 100644 --- a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py +++ b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py @@ -29,7 +29,7 @@ def predict(self, input_data): """ prediction = self.model.predict(input_data.features) - prediction = nan_to_num(prediction) + prediction = np.nan_to_num(prediction) return prediction @@ -40,7 +40,7 @@ def predict_proba(self, input_data): """ prediction = self.model.predict_proba(input_data.features) - prediction = nan_to_num(prediction) + prediction = np.nan_to_num(prediction) return prediction @@ -93,14 +93,3 @@ class QDAImplementation(DiscriminantAnalysisImplementation): def __init__(self, params: Optional[OperationParameters] = None): super().__init__(params) self.model = QuadraticDiscriminantAnalysis(**self.params.to_dict()) - - -def nan_to_num(prediction): - """ Function converts nan values to numerical - - :return prediction: prediction without nans - """ - if np.array([pd.isna(_) for _ in prediction]).any(): - prediction = np.nan_to_num(prediction) - - return prediction From 4d42edb01fbffa6caebab6f8deaca434c84db585 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 28 Feb 2023 17:35:28 +0300 Subject: [PATCH 26/72] optimized cat features transform --- fedot/preprocessing/data_types.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 0298d456e2..b613595619 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -307,25 +307,24 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - _, n_cols = data.features.shape - for column_id in range(n_cols): + for column_id, column in enumerate(data.features.T): # For every int/float column perform check column_type = data.supplementary_data.column_types['features'][column_id] if column_type in [TYPE_TO_ID[int], TYPE_TO_ID[float]]: - numerical_column = pd.Series(data.features[:, column_id]) + pd_column = pd.Series(column) # Calculate number of unique values except nans - unique_numbers = len(numerical_column.dropna().unique()) + unique_numbers = len(pd_column.dropna().unique()) if 2 < unique_numbers < self.categorical_max_uniques_th: # Column need to be transformed into categorical (string) one self.numerical_into_str.append(column_id) # Convert into string - converted_array = convert_num_column_into_string_array(numerical_column) + converted_array = convert_num_column_into_string_array(pd_column) - # Store converted column into features table - data.features[:, column_id] = converted_array + # Store converted column into feature column + column[:] = converted_array # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] @@ -337,13 +336,14 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # There is no transformation for current table return data - _, n_cols = data.features.shape - for column_id in range(n_cols): + for column_id, column in enumerate(data.features.T): if column_id in self.numerical_into_str: - numerical_column = pd.Series(data.features[:, column_id]) + pd_column = pd.Series(column) # Column must be converted into categorical - converted_array = convert_num_column_into_string_array(numerical_column) - data.features[:, column_id] = converted_array + converted_array = convert_num_column_into_string_array(pd_column) + + # Store converted column into feature column + column[:] = converted_array # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] From 8f96d2d47a291ef5c98a5c540b9da05a07bb3a75 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 30 Mar 2023 15:59:08 +0300 Subject: [PATCH 27/72] rid of for loops (v1) --- .../data_operations/categorical_encoders.py | 42 +++--- fedot/preprocessing/categorical.py | 68 ++++----- fedot/preprocessing/data_types.py | 133 ++++++++---------- 3 files changed, 112 insertions(+), 131 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 27607c00f2..b2c6d6f3c7 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -140,13 +140,8 @@ def transform(self, input_data: InputData) -> OutputData: """ copied_data = deepcopy(input_data) if self.categorical_ids: - # If categorical features are exists - transform them inplace in InputData - for categorical_id in self.categorical_ids: - categorical_column = input_data.features[:, categorical_id] - nan_idxs: Tuple[np.ndarray, ...] 
= pd.isna(categorical_column).nonzero()
-
-            transformed = self._apply_label_encoder(categorical_column, categorical_id, nan_idxs)
-            copied_data.features[:, categorical_id] = transformed
+            # If categorical features exist - transform them inplace in InputData
+            self._apply_label_encoder(copied_data.features)
 
         output_data = self._convert_to_output(copied_data,
                                               copied_data.features)
@@ -173,24 +168,25 @@ def _fit_label_encoders(self, input_data: InputData):
 
             self.encoders.update({categorical_id: le})
 
-    def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int,
-                             nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray:
-        """ Apply fitted LabelEncoder for column transformation
-
-        :param categorical_column: numpy array with categorical features
-        :param categorical_id: index of current categorical column
-        :param nan_idxs: indices of gap elements in the ``categorical_column``
+    def _apply_label_encoder(self, data: np.ndarray):
         """
-        column_encoder = self.encoders[categorical_id]
-        column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column)))
+        Applies the fitted LabelEncoder to all categorical features inplace
 
-        transformed_column = column_encoder.transform(categorical_column)
-        if len(nan_idxs[0]):
-            # Store np.nan values
-            transformed_column = transformed_column.astype(object)
-            transformed_column[nan_idxs] = np.nan
-
-        return transformed_column
+        Args:
+            data: numpy array with all features
+        """
+        categorical_columns = data[:, self.categorical_ids]
+        for column_id, column in zip(self.categorical_ids, categorical_columns.T):
+            column_encoder = self.encoders[column_id]
+            column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, column)))
+
+            transformed_column = column_encoder.transform(column)
+            nan_idxs: Tuple[np.ndarray, ...]
= pd.isna(column).nonzero() + if len(nan_idxs[0]): + # Store np.nan values + transformed_column = transformed_column.astype(object) + transformed_column[nan_idxs] = np.nan + data[:, column_id] = transformed_column def get_params(self) -> OperationParameters: """ Due to LabelEncoder has no parameters - return empty set """ diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index fe5fa1eeaf..7c01f4b674 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -33,25 +33,24 @@ def fit(self, input_data: InputData): return self binary_ids_to_convert = [] - for column_id, column in enumerate(input_data.features.T): - pd_column = pd.Series(column, copy=True) + for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): + pd_column = pd.Series(column, name=column_id, copy=True) is_nan = pd_column.isna() - column_uniques = pd_column.unique() - if is_nan.sum() and column_id in categorical_ids: + column_nuniques = pd_column.nunique(False) + if is_nan.sum(): # This categorical column has nans pd_column[is_nan] = FEDOT_STR_NAN - if len(column_uniques) <= 3: + if column_nuniques <= 3: # There is column with binary categories and gaps self.binary_features_with_nans.append(column_id) binary_ids_to_convert.append(column_id) - self._train_encoder(pd_column, column_id) - else: - if len(column_uniques) <= 2 and column_id in categorical_ids: - # Column contains binary string feature - binary_ids_to_convert.append(column_id) - # Train encoder for current column - self._train_encoder(pd_column, column_id) + self._train_encoder(pd_column) + elif column_nuniques <= 2: + # Column contains binary string feature + binary_ids_to_convert.append(column_id) + # Train encoder for current column + self._train_encoder(pd_column) self.binary_ids_to_convert = binary_ids_to_convert return self @@ -65,14 +64,7 @@ def transform(self, input_data: InputData) -> InputData: return input_data copied_data = deepcopy(input_data) - for column_id, column in enumerate(copied_data.features.T): - if column_id in self.binary_ids_to_convert: - # If column contains nans - replace them with fedot nans special string - nan_idxs: Tuple[np.ndarray, ...] = pd.isna(column).nonzero() - column[nan_idxs] = FEDOT_STR_NAN - - # Convert into integers - column[:] = self._apply_encoder(column, column_id, nan_idxs) + self._apply_encoder(copied_data.features) # Update features types features_types = copied_data.supplementary_data.column_types['features'] @@ -93,7 +85,7 @@ def fit_transform(self, input_data: InputData) -> InputData: self.fit(input_data) return self.transform(input_data) - def _train_encoder(self, column: pd.Series, column_id: int): + def _train_encoder(self, column: pd.Series): """ Convert labels in the column from string into int via Label encoding. So, Label encoder is fitted to do such transformation. 
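+
+        A minimal sketch of the expected mapping (hypothetical values): for a binary
+        column pd.Series(['no', 'yes', 'no']), the fitted LabelEncoder sorts classes
+        alphabetically, so transform() would yield [0, 1, 0].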
""" @@ -101,18 +93,26 @@ def _train_encoder(self, column: pd.Series, column_id: int): encoder.fit(column) # Store fitted label encoder for transform method - self.binary_encoders.update({column_id: encoder}) - - def _apply_encoder(self, column: np.ndarray, column_id: int, nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray: - """ Apply already fitted encoders """ - encoder = self.binary_encoders[column_id] - # Extend encoder classes if the column contains categories not previously encountered - encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) + self.binary_encoders.update({column.name: encoder}) - converted = encoder.transform(column) - if len(nan_idxs[0]): - # Column has nans in its structure - after conversion replace it - converted = converted.astype(float) - converted[nan_idxs] = np.nan + def _apply_encoder(self, data: np.ndarray): + """ + Applies already fitted encoders to all binary features inplace - return converted + Args: + data: numpy array with all features + """ + binary_columns = data[:, self.binary_ids_to_convert] + for column_id, column in zip(self.binary_ids_to_convert, binary_columns.T): + encoder = self.binary_encoders[column_id] + nan_idxs: Tuple[np.ndarray, ...] = pd.isna(column).nonzero() + column[nan_idxs] = FEDOT_STR_NAN + # Extend encoder classes if the column contains categories not previously encountered + encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) + + converted = encoder.transform(column) + if len(nan_idxs[0]): + # Column has nans in its structure - after conversion replace it + converted = converted.astype(float) + converted[nan_idxs] = np.nan + data[:, column_id] = converted diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index b613595619..44a86d4fab 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -142,7 +142,7 @@ def features_types_converting(self, features: np.ndarray) -> np.array: for mixed_column_id in features_with_mixed_types: column_info = self.features_columns_info[mixed_column_id] - if column_info.get('str_number') > 0 or column_info.get('float_number') > 0: + if column_info.get('str_number') or column_info.get('float_number'): # There are string elements in the array mixed_column = features[:, mixed_column_id] updated_column, new_type_name = self._convert_feature_into_one_type(mixed_column, column_info, @@ -170,7 +170,7 @@ def target_types_converting(self, target: np.ndarray, task: Task) -> np.array: for mixed_column_id in target_with_mixed_types: column_info = self.target_columns_info[mixed_column_id] - if column_info.get('str_number') > 0: + if column_info.get('str_number'): # There are string elements in the array mixed_column = target[:, mixed_column_id] updated_column, new_type_name = self._convert_target_into_one_type(mixed_column, column_info, @@ -307,28 +307,21 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - for column_id, column in enumerate(data.features.T): - # For every int/float column perform check - column_type = data.supplementary_data.column_types['features'][column_id] - if column_type in [TYPE_TO_ID[int], TYPE_TO_ID[float]]: - pd_column = pd.Series(column) - - # Calculate number of unique values except nans - unique_numbers = len(pd_column.dropna().unique()) - - if 2 < unique_numbers < self.categorical_max_uniques_th: - # Column need to be transformed into categorical (string) one - self.numerical_into_str.append(column_id) - - # Convert into string - converted_array = convert_num_column_into_string_array(pd_column) - - # Store converted column into feature column - column[:] = converted_array - - # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[column_id] = TYPE_TO_ID[str] + features_types = data.supplementary_data.column_types['features'] + is_numeric_type = np.isin(features_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) + numeric_type_ids = np.nonzero(is_numeric_type)[0] + num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) + nuniques = num_df.nunique(dropna=True) + # reduce dataframe to include only categorical features + num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)] + cat_col_ids = num_df.columns + # Convert into string + data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() + # Columns need to be transformed into categorical (string) ones + self.numerical_into_str.extend(cat_col_ids) + for column_id in cat_col_ids: + # Update information about column types (in-place) + features_types[column_id] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -336,18 +329,16 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # There is no transformation for current table return data - for column_id, column in enumerate(data.features.T): - if column_id in self.numerical_into_str: - pd_column = pd.Series(column) - # Column must be converted into categorical - converted_array = convert_num_column_into_string_array(pd_column) - - # Store converted column into feature column - column[:] = converted_array - - # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[column_id] = TYPE_TO_ID[str] + # Get numerical columns + num_df = pd.DataFrame(data.features[:, self.numerical_into_str], columns=self.numerical_into_str) + + # Convert and apply categorical transformation + data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() + + # Update information about column types (in-place) + features_types = data.supplementary_data.column_types['features'] + for column_id in self.numerical_into_str: + features_types[column_id] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ @@ -396,7 +387,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): # There is no transformation for current table return data - n_rows, n_cols = data.features.shape + _, n_cols = data.features.shape for column_id in range(n_cols): if column_id in self.categorical_into_float and column_id not 
in self.string_columns_transformation_failed: string_column = pd.Series(data.features[:, column_id]) @@ -418,8 +409,7 @@ def define_column_types(table: np.ndarray): if table is None: return {} - _, n_columns = table.shape - + #df_of_types = pd.DataFrame(table_of_types).transform() nans = pd.isna(table) table_of_types = np.empty_like(table, dtype=np.int8) table_of_types[~nans] = [ @@ -428,43 +418,39 @@ def define_column_types(table: np.ndarray): ] table_of_types[nans] = TYPE_TO_ID[type(None)] - columns_info = {} - for column_id, col_types in enumerate(table_of_types.T): - unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True) + table_of_types = pd.DataFrame(table_of_types) - if len(unique_col_types) > 1: - numbers = [ - unique_col_types_number[unique_col_types == TYPE_TO_ID[t]] - for t in [str, int, float] - ] - str_number, int_number, float_number = [ - number.item() if len(number) else 0 - for number in numbers - ] + # Build dataframe with unique types for each column + uniques = table_of_types.apply([pd.unique]).rename(index={'unique': 'types'}) - # Store information about nans in the target - nan_ids = np.nonzero(nans[:, column_id])[0] - columns_info.update({column_id: {'types': unique_col_types, - 'str_number': str_number, - 'int_number': int_number, - 'float_number': float_number, - 'nan_number': len(nan_ids), - 'nan_ids': nan_ids}}) - else: - # There is only one type, or several types such as int and float - columns_info.update({column_id: {'types': unique_col_types}}) - return columns_info + # Build dataframe with amount of each type + counts_index_mapper = { + TYPE_TO_ID[str]: 'str_number', + TYPE_TO_ID[int]: 'int_number', + TYPE_TO_ID[float]: 'float_number', + TYPE_TO_ID[type(None)]: 'nan_number' + } + types_counts = ( + table_of_types + .apply(pd.value_counts, dropna=False) + .reindex(counts_index_mapper.keys(), copy=False) + .replace(np.nan, 0) + .rename(index=counts_index_mapper, copy=False) + .astype(int) + ) + + # Build dataframe with nans indices + nans_ids = pd.DataFrame(nans).apply(np.where).rename(index={0: 'nan_ids'}) + return pd.concat([uniques, types_counts, nans_ids]).to_dict() def find_mixed_types_columns(columns_info: dict): """ Search for columns with several types in them """ - columns_with_mixed_types = [] - for column_id, information in columns_info.items(): - column_types = information['types'] - if len(column_types) > 1: - columns_with_mixed_types.append(column_id) - - return columns_with_mixed_types + return [ + col_id + for col_id, col_info in columns_info.items() + if len(col_info['types']) > 1 + ] def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter): @@ -510,9 +496,8 @@ def _obtain_new_column_type(column_info: dict): if column_info['float_number'] > 0 or column_info['nan_number'] > 0: # Even if one of types are float - all elements should be converted into float return float - else: - # It is available to convert numerical into integer type - return int + # It is available to convert numerical into integer type + return int def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray, @@ -549,7 +534,7 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) filtered_types = [x for x in column_types_ids if x != TYPE_TO_ID[type(None)]] updated_column_types.append(filtered_types[0]) else: - if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_types_ids): + if TYPE_TO_ID[str] in column_types_ids: # Mixed-types column with 
string new_column_type = converted_columns[column_id] if new_column_type != -1: From 7568b2bfb857b50db931a462fbd49e4f39783167 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 18 Apr 2023 17:39:12 +0300 Subject: [PATCH 28/72] compound names fix --- fedot/core/data/data_preprocessing.py | 23 ++++++++++--------- .../data_operations/categorical_encoders.py | 8 +++---- .../sklearn_transformations.py | 4 ++-- fedot/preprocessing/categorical.py | 6 ++--- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c8de456492..bb060047ce 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,4 +1,4 @@ -from typing import Tuple, Optional +from typing import Tuple, Optional, List import numpy as np import pandas as pd @@ -55,28 +55,29 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li # Only categorical categorical_input = input_data.subset_features(categorical_ids) return None, categorical_input - else: prefix = 'InputData contains no categorical and no numerical features.' raise ValueError(f'{prefix} Check data for Nans and inf values') -def find_categorical_columns(table: np.ndarray, column_types: dict = None): +def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[List[int]] = None): """ Method for finding categorical and non-categorical columns in tabular data - :param table: tabular data for string columns types determination - :param column_types: list with column types. If None, perform default checking - :return categorical_ids: indices of categorical columns in table - :return non_categorical_ids: indices of non categorical columns in table + Args: + table: tabular data for string columns types determination. + column_type_ids: list with column types. If None, perform default checking. + Returns: + categorical_ids: indices of categorical columns in table. + non_categorical_ids: indices of non categorical columns in table. 
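+
+    Example (hypothetical type ids): for a two-column table with
+    column_type_ids = [TYPE_TO_ID[str], TYPE_TO_ID[int]], the call
+    find_categorical_columns(table, column_type_ids) would return ([0], [1]) -
+    column 0 is treated as categorical, column 1 is not.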
""" - if column_types is None: + if column_type_ids is None: # Define if data contains string columns for "unknown table" return force_categorical_determination(table) categorical_ids = [] non_categorical_ids = [] - for col_id, col_type_id in enumerate(column_types): + for col_id, col_type_id in enumerate(column_type_ids): if col_type_id == TYPE_TO_ID[str]: categorical_ids.append(col_id) else: @@ -113,8 +114,8 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: return False - features_types = data.supplementary_data.column_types.get('features') - cat_ids, non_cat_ids = find_categorical_columns(data.features, features_types) + column_type_ids = data.supplementary_data.column_types.get('features') + cat_ids, non_cat_ids = find_categorical_columns(data.features, column_type_ids) data_has_categorical_columns = len(cat_ids) > 0 data.numerical_idx = non_cat_ids diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index b2c6d6f3c7..361ccc2eee 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -36,9 +36,9 @@ def fit(self, input_data: InputData): :return encoder: trained encoder (optional output) """ features = input_data.features - features_types_ids = input_data.supplementary_data.column_types.get('features') + column_type_ids = input_data.supplementary_data.column_types.get('features') categorical_ids, non_categorical_ids = find_categorical_columns(features, - features_types_ids) + column_type_ids) # Indices of columns with categorical and non-categorical features self.categorical_ids = categorical_ids @@ -124,9 +124,9 @@ def __init__(self, params: Optional[OperationParameters] = None): self.non_categorical_ids = None def fit(self, input_data: InputData): - features_types = input_data.supplementary_data.column_types.get('features') + column_type_ids = input_data.supplementary_data.column_types.get('features') self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - features_types) + column_type_ids) # If there are categorical features - process it if self.categorical_ids: diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 47f77ba9e0..530eb84320 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -290,9 +290,9 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) if data_type_is_table(input_data) and data_has_categorical_features(input_data): - features_types = input_data.supplementary_data.column_types.get('features') + column_type_ids = input_data.supplementary_data.column_types.get('features') self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - features_types) + column_type_ids) numerical, categorical = divide_data_categorical_numerical(input_data, self.categorical_ids, self.non_categorical_ids) diff --git a/fedot/preprocessing/categorical.py 
b/fedot/preprocessing/categorical.py
index 7c01f4b674..382b927106 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -25,9 +25,9 @@ def fit(self, input_data: InputData):
         Find indices of columns which contain categorical values. Binary features
         and at the same time have str objects. If there are such features - convert them into int
         """
-        features_types = input_data.supplementary_data.column_types['features']
-        categorical_ids, _ = find_categorical_columns(table=input_data.features,
-                                                      column_types=features_types)
+        column_type_ids = input_data.supplementary_data.column_types['features']
+        categorical_ids, _ = find_categorical_columns(input_data.features,
+                                                      column_type_ids)
         if len(categorical_ids) == 0:
             # There is no need to process categorical features
             return self

From d566947ba8076249e4297b694bdf8bea5a365e48 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Tue, 18 Apr 2023 17:42:02 +0300
Subject: [PATCH 29/72] simplified data_preprocessing.py

---
 fedot/core/data/data.py               |  9 ++++++---
 fedot/core/data/data_preprocessing.py | 18 +++---------------
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index 0247638e96..bf06dbf87b 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -530,11 +530,14 @@ def subset_indices(self, selected_idx: List):
                          target=self.target[row_nums], task=self.task,
                          data_type=self.data_type)
 
-    def subset_features(self, features_ids: list):
-        """Return new :obj:`InputData` with subset of features based on ``features_ids`` list
-        """
+    def subset_features(self, feature_ids: list) -> Optional[InputData]:
+        """
+        Return new :obj:`InputData` with subset of features based on non-empty ``feature_ids`` list or `None` otherwise
+        """
+        if not feature_ids:
+            return None
 
-        subsample_features = self.features[:, features_ids]
+        subsample_features = self.features[:, feature_ids]
         subsample_input = InputData(features=subsample_features,
                                     data_type=self.data_type,
                                     target=self.target, task=self.task,
diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index bb060047ce..7c35cda649 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -39,22 +39,10 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li
     Split tabular InputData into two parts: with numerical and categorical features
     using list with ids of categorical and numerical features.
     """
-
-    if len(categorical_ids) > 0 and len(non_categorical_ids) > 0:
-        # Both categorical and numerical features
-        numerical_input = input_data.subset_features(non_categorical_ids)
-        categorical_input = input_data.subset_features(categorical_ids)
+    numerical_input = input_data.subset_features(non_categorical_ids)
+    categorical_input = input_data.subset_features(categorical_ids)
+    if not (numerical_input or categorical_input):
         return numerical_input, categorical_input
-
-    elif len(categorical_ids) == 0 and len(non_categorical_ids) > 0:
-        # Only numerical
-        numerical_input = input_data.subset_features(non_categorical_ids)
-        return numerical_input, None
-
-    elif len(categorical_ids) > 0 and len(non_categorical_ids) == 0:
-        # Only categorical
-        categorical_input = input_data.subset_features(categorical_ids)
-        return None, categorical_input
     else:
         prefix = 'InputData contains no categorical and no numerical features.'
raise ValueError(f'{prefix} Check data for Nans and inf values')

From eb7e28a1a15cf6ee03e416296eea9641247cfbe3 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 19 Apr 2023 17:19:06 +0300
Subject: [PATCH 30/72] data_preprocessing logic fix

---
 fedot/core/data/data_preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index 7c35cda649..b241e2cf43 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -41,7 +41,7 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li
     """
     numerical_input = input_data.subset_features(non_categorical_ids)
     categorical_input = input_data.subset_features(categorical_ids)
-    if not (numerical_input or categorical_input):
+    if numerical_input or categorical_input:
         return numerical_input, categorical_input
     else:
         prefix = 'InputData contains no categorical and no numerical features.'

From 2247e0227d741e92f48a3b6a18c3e355c886d875 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 19 Apr 2023 17:57:11 +0300
Subject: [PATCH 31/72] numpy's nonzero to flatnonzero

---
 .../data_operations/categorical_encoders.py | 10 +++++-----
 fedot/preprocessing/categorical.py          |  4 ++--
 fedot/preprocessing/preprocessing.py        |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 361ccc2eee..6213c53a75 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import Optional, Tuple
+from typing import Optional, List
 
 import numpy as np
 import pandas as pd
@@ -120,8 +120,8 @@ def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)  # LabelEncoder has no parameters
         self.encoders = {}
-        self.categorical_ids = None
-        self.non_categorical_ids = None
+        self.categorical_ids: Optional[List[int]] = None
+        self.non_categorical_ids: Optional[List[int]] = None
 
     def fit(self, input_data: InputData):
         column_type_ids = input_data.supplementary_data.column_types.get('features')
@@ -181,8 +181,8 @@ def _apply_label_encoder(self, data: np.ndarray):
             column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, column)))
 
             transformed_column = column_encoder.transform(column)
-            nan_idxs: Tuple[np.ndarray, ...]
= pd.isna(column).nonzero() + nan_idxs = np.flatnonzero(pd.isna(column)) column[nan_idxs] = FEDOT_STR_NAN # Extend encoder classes if the column contains categories not previously encountered encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) - if len(nan_idxs[0]): + if len(nan_idxs): # Column has nans in its structure - after conversion replace it converted = converted.astype(float) converted[nan_idxs] = np.nan diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 04bcf106a5..5186a1b84c 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -269,7 +269,7 @@ def _find_features_lacking_nans(self, data: InputData, source_name: str): features = data.features axes_except_cols = (0,) + tuple(range(2, features.ndim)) are_allowed = np.mean(pd.isna(features), axis=axes_except_cols) < ALLOWED_NAN_PERCENT - self.ids_relevant_features[source_name] = np.nonzero(are_allowed)[0] + self.ids_relevant_features[source_name] = np.flatnonzero(are_allowed) @staticmethod def _drop_rows_with_nan_in_target(data: InputData) -> InputData: From 9443d2169c694f1230c8fa600ab414d270b133c8 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 19 Apr 2023 17:58:22 +0300 Subject: [PATCH 32/72] simplified data_types.py --- fedot/preprocessing/data_types.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 44a86d4fab..ec6754f179 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,7 +1,7 @@ from __future__ import annotations from copy import copy -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Tuple, Optional, Dict import numpy as np import pandas as pd @@ -128,17 +128,12 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): table = np.delete(table, self.columns_to_del, 1) return table - def features_types_converting(self, features: np.ndarray) -> np.array: + def features_types_converting(self, features: np.ndarray) -> np.ndarray: """ Convert all elements in the data in every feature column into one type :param features: tabular features array """ features_with_mixed_types = find_mixed_types_columns(self.features_columns_info) - - if not features_with_mixed_types: - return features - - # There are mixed-types columns in features table - convert them for mixed_column_id in features_with_mixed_types: column_info = self.features_columns_info[mixed_column_id] @@ -309,7 +304,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): """ features_types = data.supplementary_data.column_types['features'] is_numeric_type = np.isin(features_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) - numeric_type_ids = np.nonzero(is_numeric_type)[0] + numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) nuniques = num_df.nunique(dropna=True) # reduce dataframe to include only categorical features @@ -401,7 +396,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): features_types[column_id] = TYPE_TO_ID[float] -def define_column_types(table: np.ndarray): +def define_column_types(table: Optional[np.ndarray]) -> Dict: """ Prepare information about types per columns. For each column store unique types, which column contains. 
If column with mixed type contain str object additional field 'str_ids' with indices of string objects is prepared @@ -409,16 +404,8 @@ def define_column_types(table: np.ndarray): if table is None: return {} - #df_of_types = pd.DataFrame(table_of_types).transform() - nans = pd.isna(table) - table_of_types = np.empty_like(table, dtype=np.int8) - table_of_types[~nans] = [ - TYPE_TO_ID[type(x.item() if isinstance(x, (np.ndarray, np.generic)) else x)] - for x in table[~nans] - ] - table_of_types[nans] = TYPE_TO_ID[type(None)] - - table_of_types = pd.DataFrame(table_of_types) + table_of_types = pd.DataFrame(table, copy=True) + table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8) # Build dataframe with unique types for each column uniques = table_of_types.apply([pd.unique]).rename(index={'unique': 'types'}) @@ -440,7 +427,9 @@ def define_column_types(table: np.ndarray): ) # Build dataframe with nans indices - nans_ids = pd.DataFrame(nans).apply(np.where).rename(index={0: 'nan_ids'}) + nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: 'nan_ids'}) + + # Combine all dataframes return pd.concat([uniques, types_counts, nans_ids]).to_dict() From adc77b6d678d6f6a6fd217953150a693340a4e40 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 27 Apr 2023 14:07:55 +0300 Subject: [PATCH 33/72] further opts --- fedot/preprocessing/data_types.py | 289 ++++++++++++++---------------- 1 file changed, 132 insertions(+), 157 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index ec6754f179..0e4f699c06 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,7 +1,7 @@ from __future__ import annotations from copy import copy -from typing import TYPE_CHECKING, Tuple, Optional, Dict +from typing import TYPE_CHECKING, Tuple, Optional, List, Dict import numpy as np import pandas as pd @@ -17,6 +17,13 @@ TYPE_TO_ID = dict(zip(_convertable_types, _types_ids)) +_TYPES = 'types' +_FLOAT_NUMBER = 'float_number' +_INT_NUMBER = 'int_number' +_STR_NUMBER = 'str_number' +_NAN_NUMBER = 'nan_number' +_NAN_IDS = 'nan_ids' + FEDOT_STR_NAN = 'fedot_nan' # If unique values in the feature column is less than 13 - convert column into string type else to numerical CATEGORICAL_MAX_UNIQUE_TH = 13 @@ -38,8 +45,8 @@ def __init__(self): self.acceptable_failed_rate_bottom = ACCEPTABLE_CONVERSION_FAILED_RATE_BOTTOM self.acceptable_failed_rate_top = ACCEPTABLE_CONVERSION_FAILED_RATE_TOP - self.features_columns_info = {} - self.target_columns_info = {} + self.features_columns_info = pd.DataFrame() + self.target_columns_info = pd.DataFrame() # Dictionary with information about converted during fitting columns self.features_converted_columns = {} @@ -116,15 +123,7 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): :param table: tabular dataset based on which new dataset will be generated :param converted_columns: dictionary with actions with table """ - if not converted_columns: - return table - - self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id == -1] - if not self.columns_to_del: - # There are no columns to delete - return table - - # Remove all "bad" columns + self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id is None] table = np.delete(table, self.columns_to_del, 1) return table @@ -133,48 +132,37 @@ def features_types_converting(self, 
features: np.ndarray) -> np.ndarray: :param features: tabular features array """ - features_with_mixed_types = find_mixed_types_columns(self.features_columns_info) - for mixed_column_id in features_with_mixed_types: - column_info = self.features_columns_info[mixed_column_id] + mixed_types_columns = _find_mixed_types_columns(self.features_columns_info) + cols_with_strings_or_floats = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER, _FLOAT_NUMBER]) - if column_info.get('str_number') or column_info.get('float_number'): - # There are string elements in the array - mixed_column = features[:, mixed_column_id] - updated_column, new_type_name = self._convert_feature_into_one_type(mixed_column, column_info, - mixed_column_id) - # Store information about converted columns - self.features_converted_columns.update({mixed_column_id: new_type_name}) + def _update_converted_columns_and_data(column_info: pd.Series): + updated_column, new_type_id = self._convert_feature_into_one_type(features[:, column_info.name], + column_info) + self.features_converted_columns[column_info.name] = new_type_id + if updated_column is not None: + features[:, column_info.name] = updated_column - if updated_column is not None: - features[:, mixed_column_id] = updated_column + cols_with_strings_or_floats.apply(_update_converted_columns_and_data) return features - def target_types_converting(self, target: np.ndarray, task: Task) -> np.array: + def target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray: """ Convert all elements in every target column into one type :param target: tabular target array :param task: task to solve """ - target_with_mixed_types = find_mixed_types_columns(self.target_columns_info) - - if not target_with_mixed_types: - return target - - # There are mixed-types columns in features table - convert them - for mixed_column_id in target_with_mixed_types: - column_info = self.target_columns_info[mixed_column_id] + mixed_types_columns = _find_mixed_types_columns(self.target_columns_info) + cols_with_strings = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER]) - if column_info.get('str_number'): - # There are string elements in the array - mixed_column = target[:, mixed_column_id] - updated_column, new_type_name = self._convert_target_into_one_type(mixed_column, column_info, - mixed_column_id, task) - # Store information about converted columns - self.target_converted_columns.update({mixed_column_id: new_type_name}) + def _update_converted_columns_and_data(column_info: pd.Series): + updated_column, new_type_id = self._convert_target_into_one_type(target[:, column_info.name], column_info, + task) + self.target_converted_columns[column_info.name] = new_type_id + if updated_column is not None: + target[:, column_info.name] = updated_column - if updated_column is not None: - target[:, mixed_column_id] = updated_column + cols_with_strings.apply(_update_converted_columns_and_data) return target @@ -183,11 +171,11 @@ def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = """ Prepare information about columns in a form of dictionary Dictionary has two keys: 'target' and 'features' """ - if not self.features_columns_info: + if self.features_columns_info.empty: # Information about column types is empty - there is a need to launch algorithm to collect info self.features_columns_info = define_column_types(predictors) predictors = self.features_types_converting(features=predictors) - if not self.target_columns_info and task.task_type is not 
TaskTypesEnum.ts_forecasting: + if self.target_columns_info.empty and task.task_type is not TaskTypesEnum.ts_forecasting: self.target_columns_info = define_column_types(target) target = self.target_types_converting(target=target, task=task) @@ -205,7 +193,7 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): """ Update information in supplementary info - retain info only about remained columns. Such columns have no conflicts with types converting. """ - if len(self.string_columns_transformation_failed) > 0: + if self.string_columns_transformation_failed: self.log.warning(f'Columns with indices {list(self.string_columns_transformation_failed.keys())} were ' f'removed during mixed types column converting due to conflicts.') @@ -225,32 +213,32 @@ def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): self.log.warning('Columns number and types numbers do not match.') @staticmethod - def _remove_pseudo_str_values_from_str_column(data: pd.DataFrame, column_id: int): + def _remove_pseudo_str_values_from_str_column(data: InputData, columns: pd.Index): """ Removes from truly str column all pseudo str values """ - cur_column = data.features[:, column_id] - converted_column = [] - for i in range(len(cur_column)): - try: - float(cur_column[i]) - converted_column.append(np.nan) - except ValueError: - converted_column.append(cur_column[i]) - data.features[:, column_id] = pd.Series(converted_column).values - - def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int): + for col_id in columns: + for row_id, item in enumerate(data.features[:, col_id]): + try: + float(item) + except ValueError: + continue + else: + # item is numeric, remove its value + data.features[row_id, col_id] = np.nan + + def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series): """ Determine new type for current feature column based on the string ratio. And then convert column into it. 
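+
+        A worked example of the rule below (hypothetical counts): with 10 str, 85 int
+        and 5 float values, string_ratio = 10 / 100 > 0, so the whole column is cast
+        to str; when no str values are present, the int/float choice is delegated
+        to _obtain_new_column_type.
+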
:param mixed_column: one-dimensional array with several data types
-        :param column_info: dictionary with information about types in the column
-        :param mixed_column_id: index of column in dataset
+        :param column_info: pd.Series with information about types in the column
         """
-        if len(column_info['types']) == 2 and TYPE_TO_ID[type(None)] in column_info['types']:
+        if len(column_info[_TYPES]) == 2 and TYPE_TO_ID[type(None)] in column_info[_TYPES]:
             # Column contains only one data type and nans
-            filtered_types = [x for x in column_info['types'] if x != TYPE_TO_ID[type(None)]]
+            filtered_types = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
             return mixed_column, filtered_types[0]
 
-        string_objects_number = column_info['str_number']
-        all_elements_number = string_objects_number + column_info['int_number'] + column_info['float_number']
+        string_objects_number = column_info[_STR_NUMBER]
+        all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
         string_ratio = string_objects_number / all_elements_number
 
         if string_ratio > 0:
             suggested_type = str
         else:
             suggested_type = _obtain_new_column_type(column_info)
 
         try:
             mixed_column = mixed_column.astype(suggested_type)
             # If there were nans in the column - paste nan
-            if column_info['nan_number'] > 0:
+            if column_info[_NAN_NUMBER]:
                 mixed_column = mixed_column.astype(object)
-                mixed_column[column_info['nan_ids']] = np.nan
-                del column_info['nan_ids']
+                mixed_column[column_info[_NAN_IDS]] = np.nan
+                del column_info[_NAN_IDS]
             return mixed_column, TYPE_TO_ID[suggested_type]
         except ValueError:
             # Cannot convert string objects into int or float (for example 'a' into int)
-            prefix = f'Feature column with index {mixed_column_id} contains ' \
-                     f'following data types: {column_info["types"]}.'
+            prefix = f'Feature column with index {column_info.name} contains ' \
+                     f'following data types: {column_info[_TYPES]}.'
             self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.')
-            return None, -1
+            return None, None
 
-    def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int,
+    def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series,
                                       task: Task) -> Tuple[np.ndarray, str]:
         """ Convert target columns into one type based on column proportions of object and task """
         if task.task_type is TaskTypesEnum.classification:
@@ -290,8 +278,8 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: d
         target_column = pd.Series(mixed_column)
         converted_column = pd.to_numeric(target_column, errors='coerce')
 
-        prefix = f'Target column with index {mixed_column_id} contains ' \
-                 f'following data types: {column_info["types"]}.'
+        prefix = (f'Target column with index {column_info.name} contains '
+                  f'following data types: {column_info[_TYPES]}.')
         log_message = f'{prefix} String cannot be converted into {suggested_type}. Ignore non-converted values.'
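+        # pd.to_numeric(errors='coerce') above leaves unparsable values as NaN,
+        # so failed conversions are only logged below and ignored rather than raising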
self.log.debug(log_message) self.target_converting_has_errors = True @@ -313,7 +301,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): # Convert into string data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Columns need to be transformed into categorical (string) ones - self.numerical_into_str.extend(cat_col_ids) + self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) for column_id in cat_col_ids: # Update information about column types (in-place) features_types[column_id] = TYPE_TO_ID[str] @@ -326,10 +314,10 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # Get numerical columns num_df = pd.DataFrame(data.features[:, self.numerical_into_str], columns=self.numerical_into_str) - + # Convert and apply categorical transformation data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() - + # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] for column_id in self.numerical_into_str: @@ -339,107 +327,94 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): """ Automatically determine categorical features which should be converted into float """ - n_rows, n_cols = data.features.shape - for column_id in range(n_cols): - # For every string column perform converting if necessary - column_type = data.supplementary_data.column_types['features'][column_id] - if column_type == TYPE_TO_ID[str]: - string_column = pd.Series(data.features[:, column_id]) - - # Number of nans in the column - nans_number = string_column.isna().sum() - - # Column probably not an "actually categorical" but a column with an incorrectly defined type - converted_column = pd.to_numeric(string_column, errors='coerce') - # Calculate applied nans - result_nans_number = converted_column.isna().sum() - failed_objects_number = result_nans_number - nans_number - non_nan_all_objects_number = n_rows - nans_number - failed_ratio = failed_objects_number / non_nan_all_objects_number - - # If all objects are truly strings - all objects transform into nan - is_column_contain_numerical_objects = failed_ratio != 1 - if failed_ratio < self.acceptable_failed_rate_bottom: - # The majority of objects can be converted into numerical - data.features[:, column_id] = converted_column.values - - # Update information about column types (in-place) - self.categorical_into_float.append(column_id) - features_types = data.supplementary_data.column_types['features'] - features_types[column_id] = TYPE_TO_ID[float] - elif failed_ratio >= self.acceptable_failed_rate_top \ - and is_column_contain_numerical_objects: - # The column consists mostly of truly str values and has a few ints/floats in it - self._remove_pseudo_str_values_from_str_column(data, column_id) - elif self.acceptable_failed_rate_top > failed_ratio >= self.acceptable_failed_rate_bottom: - # Probably numerical column contains a lot of '?' 
or 'x' as nans equivalents
-                    # Add columns to remove list
-                    self.string_columns_transformation_failed.update({column_id: -1})
-
-    def _into_numeric_features_transformation_for_predict(self, data: InputData):
-        """ Apply conversion into float string column for every signed column """
-        if not self.categorical_into_float:
-            # There is no transformation for current table
-            return data
+        str_columns = [
+            column_id for column_id, _ in enumerate(data.features.T)
+            if data.supplementary_data.column_types['features'][column_id] == TYPE_TO_ID[str]]
+        str_cols_df = pd.DataFrame(data.features[:, str_columns], columns=str_columns)
+        orig_nans_cnt = str_cols_df.isna().sum(axis=0)
+
+        converted_str_cols_df = str_cols_df.apply(pd.to_numeric, errors='coerce')
+        conv_nans_cnt = converted_str_cols_df.isna().sum(axis=0)
+
+        failed_objects_cnt = conv_nans_cnt - orig_nans_cnt
+        non_nan_all_objects_cnt = len(data.features) - orig_nans_cnt
+        failed_ratio = failed_objects_cnt / non_nan_all_objects_cnt
+
+        # Check if the majority of objects can be converted into numerical
+        is_numeric = failed_ratio < self.acceptable_failed_rate_bottom
+        is_numeric_ids = is_numeric[is_numeric].index
+        data.features[:, is_numeric_ids] = converted_str_cols_df[is_numeric_ids].to_numpy()
+        self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float))
+        features_types = data.supplementary_data.column_types['features']
+        for column_id in is_numeric_ids:
+            features_types[column_id] = TYPE_TO_ID[float]

-        _, n_cols = data.features.shape
-        for column_id in range(n_cols):
-            if column_id in self.categorical_into_float and column_id not in self.string_columns_transformation_failed:
-                string_column = pd.Series(data.features[:, column_id])
+        # The columns consist mostly of truly str values and have a few ints/floats in them
+        is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1)
+        self._remove_pseudo_str_values_from_str_column(data, is_mixed[is_mixed].index)

-                # Column must be converted into float from categorical
-                converted_column = pd.to_numeric(string_column, errors='coerce')
-                data.features[:, column_id] = converted_column.values
+        # If column contains a lot of '?' or 'x' as nans equivalents
+        # add it to the removal list
+        is_of_mistakes = (
+            (self.acceptable_failed_rate_bottom <= failed_ratio)
+            & (failed_ratio < self.acceptable_failed_rate_top))
+        self.string_columns_transformation_failed.update(dict.fromkeys(is_of_mistakes[is_of_mistakes].index))

-                # Update information about column types (in-place)
-                features_types = data.supplementary_data.column_types['features']
-                features_types[column_id] = TYPE_TO_ID[float]
+    def _into_numeric_features_transformation_for_predict(self, data: InputData):
+        """ Apply conversion into float for every string column marked for it during fit """
+        str_cols_ids = list(set(self.categorical_into_float)
+                            .difference(self.string_columns_transformation_failed))
+        str_cols_df = pd.DataFrame(data.features[:, str_cols_ids], columns=str_cols_ids)
+        data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy()
+        for column_id in str_cols_ids:
+            data.supplementary_data.column_types['features'][column_id] = TYPE_TO_ID[float]

-def define_column_types(table: Optional[np.ndarray]) -> Dict:
+def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame:
     """ Prepare information about types per columns. For each column store unique
-    types, which column contains.
If column with mixed type contain str object
-    additional field 'str_ids' with indices of string objects is prepared
+    types, which column contains.
     """
     if table is None:
-        return {}
+        return pd.DataFrame()

     table_of_types = pd.DataFrame(table, copy=True)
     table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8)

     # Build dataframe with unique types for each column
-    uniques = table_of_types.apply([pd.unique]).rename(index={'unique': 'types'})
+    uniques = table_of_types.apply([pd.unique]).rename(index={'unique': _TYPES})

     # Build dataframe with amount of each type
     counts_index_mapper = {
-        TYPE_TO_ID[str]: 'str_number',
-        TYPE_TO_ID[int]: 'int_number',
-        TYPE_TO_ID[float]: 'float_number',
-        TYPE_TO_ID[type(None)]: 'nan_number'
+        TYPE_TO_ID[float]: _FLOAT_NUMBER,
+        TYPE_TO_ID[int]: _INT_NUMBER,
+        TYPE_TO_ID[str]: _STR_NUMBER,
+        TYPE_TO_ID[type(None)]: _NAN_NUMBER
     }
     types_counts = (
         table_of_types
-        .apply(pd.value_counts, dropna=False)
-        .reindex(counts_index_mapper.keys(), copy=False)
-        .replace(np.nan, 0)
-        .rename(index=counts_index_mapper, copy=False)
-        .astype(int)
+            .apply(pd.value_counts, dropna=False)
+            .reindex(counts_index_mapper.keys(), copy=False)
+            .replace(np.nan, 0)
+            .rename(index=counts_index_mapper, copy=False)
+            .astype(int)
     )

     # Build dataframe with nans indices
-    nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: 'nan_ids'})
+    nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: _NAN_IDS})

     # Combine all dataframes
-    return pd.concat([uniques, types_counts, nans_ids]).to_dict()
+    return pd.concat([uniques, types_counts, nans_ids])

-def find_mixed_types_columns(columns_info: dict):
+def _find_mixed_types_columns(columns_info: pd.DataFrame) -> pd.DataFrame:
     """ Search for columns with several types in them """
-    return [
-        col_id
-        for col_id, col_info in columns_info.items()
-        if len(col_info['types']) > 1
-    ]
+    has_mixed_types = [] if columns_info.empty else columns_info.loc[_TYPES].apply(len) > 1
+    return columns_info.loc[:, has_mixed_types]
+
+
+def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> pd.DataFrame:
+    _cols_have_any = [] if frame.empty else frame.loc[rows_to_select].any()
+    return frame.loc[:, _cols_have_any]

 def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter):
@@ -462,7 +437,7 @@ def type_by_id(current_type_id: int):
         # Occurs if for predict stage there is no target info
         return None

-    n_rows, n_cols = table.shape
+    _, n_cols = table.shape
     for column_id in range(n_cols):
         current_column = table[:, column_id]
         current_type = type_by_id(column_types[column_id])
@@ -472,17 +447,17 @@ def type_by_id(current_type_id: int):
     return table

-def convert_num_column_into_string_array(numerical_column: pd.Series) -> np.array:
+def convert_num_column_into_string_array(numerical_column: pd.Series) -> pd.Series:
     """ Convert non-nan values of a numerical pandas column into strings """
     # convert only non-nans values
     true_nums = numerical_column[numerical_column.notna()]
     numerical_column[true_nums.index] = true_nums.astype(str, copy=False)
-    return numerical_column.to_numpy()
+    return numerical_column

-def _obtain_new_column_type(column_info: dict):
+def _obtain_new_column_type(column_info: pd.Series):
     """ Suggest int or float type based on the presence of nan and float values """
-    if column_info['float_number'] > 0 or column_info['nan_number'] > 0:
+    if column_info[[_FLOAT_NUMBER, _NAN_NUMBER]].any():  #
Even if one of types are float - all elements should be converted into float return float # It is available to convert numerical into integer type @@ -505,7 +480,7 @@ def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: current_type=current_type) -def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) -> list: +def _generate_list_with_types(columns_types_info: pd.DataFrame, converted_columns: Dict[int, Optional[int]]) -> list: """ Create list with types for all remained columns :param columns_types_info: dictionary with initial column types @@ -513,7 +488,7 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) """ updated_column_types = [] for column_id, column_info in columns_types_info.items(): - column_types_ids = column_info['types'] + column_types_ids = column_info[_TYPES] if len(column_types_ids) == 1: # Column initially contain only one type @@ -525,9 +500,9 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) else: if TYPE_TO_ID[str] in column_types_ids: # Mixed-types column with string - new_column_type = converted_columns[column_id] - if new_column_type != -1: - updated_column_types.append(new_column_type) + new_col_id = converted_columns[column_id] + if new_col_id is not None: + updated_column_types.append(new_col_id) else: # Mixed-types with float and integer updated_column_types.append(TYPE_TO_ID[float]) From 55701809f086f1f9b84b0758efb066fecba8764f Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 27 Apr 2023 17:42:08 +0300 Subject: [PATCH 34/72] cats ids via numpy --- fedot/api/api_utils/input_analyser.py | 13 +--- fedot/core/data/data_preprocessing.py | 14 ++--- .../data_operations/categorical_encoders.py | 7 +-- fedot/preprocessing/categorical.py | 4 -- fedot/preprocessing/data_types.py | 59 ++++++++++--------- 5 files changed, 41 insertions(+), 56 deletions(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 1c524320a3..3b9c5f2c23 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -1,9 +1,8 @@ from functools import partial from inspect import signature - -import numpy as np from typing import Dict, Tuple, Any, Union +import numpy as np from golem.core.log import default_log from fedot.core.composer.meta_rules import get_cv_folds_number, get_recommended_preset, \ @@ -118,11 +117,5 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - all_cardinality = 0 - need_label = False - for idx in categorical_ids: - all_cardinality += np.unique(input_data.features[:, idx].astype(str)).shape[0] - if all_cardinality > self.max_cat_cardinality: - need_label = True - break - return need_label + uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) + return len(uniques) > self.max_cat_cardinality diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index b241e2cf43..e5565204e6 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,4 +1,4 @@ -from typing import Tuple, Optional, List +from typing import Tuple, Optional import numpy as np import pandas as pd @@ -48,7 +48,7 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li raise ValueError(f'{prefix} Check data for Nans and inf values') -def find_categorical_columns(table: np.ndarray, column_type_ids: 
Optional[List[int]] = None): +def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.ndarray] = None): """ Method for finding categorical and non-categorical columns in tabular data @@ -63,13 +63,9 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[List[i # Define if data contains string columns for "unknown table" return force_categorical_determination(table) - categorical_ids = [] - non_categorical_ids = [] - for col_id, col_type_id in enumerate(column_type_ids): - if col_type_id == TYPE_TO_ID[str]: - categorical_ids.append(col_id) - else: - non_categorical_ids.append(col_id) + is_str = np.isin(column_type_ids, TYPE_TO_ID[str]) + categorical_ids = np.flatnonzero(is_str).tolist() + non_categorical_ids = np.flatnonzero(~is_str).tolist() return categorical_ids, non_categorical_ids diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 6213c53a75..505d2610db 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -37,12 +37,7 @@ def fit(self, input_data: InputData): """ features = input_data.features column_type_ids = input_data.supplementary_data.column_types.get('features') - categorical_ids, non_categorical_ids = find_categorical_columns(features, - column_type_ids) - - # Indices of columns with categorical and non-categorical features - self.categorical_ids = categorical_ids - self.non_categorical_ids = non_categorical_ids + self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, column_type_ids) # If there are categorical features - process it if self.categorical_ids: diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 750379bff7..fb2e59ee9e 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -1,5 +1,4 @@ from copy import deepcopy -from typing import Tuple import numpy as np import pandas as pd @@ -28,9 +27,6 @@ def fit(self, input_data: InputData): column_type_ids = input_data.supplementary_data.column_types['features'] categorical_ids, _ = find_categorical_columns(input_data.features, column_type_ids) - if len(categorical_ids) == 0: - # There is no need to process categorical features - return self binary_ids_to_convert = [] for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 0e4f699c06..34cd4cecc9 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -194,16 +194,20 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): Such columns have no conflicts with types converting. 
""" if self.string_columns_transformation_failed: - self.log.warning(f'Columns with indices {list(self.string_columns_transformation_failed.keys())} were ' + self.log.warning(f'Columns with indices {self.string_columns_transformation_failed} were ' f'removed during mixed types column converting due to conflicts.') data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) - data.supplementary_data.column_types['features'] = [ - col_type_id - for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) - if col_id not in self.string_columns_transformation_failed - ] + # data.supplementary_data.column_types['features'] = [ + # col_type_id + # for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) + # if col_id not in self.string_columns_transformation_failed + # ] + data.supplementary_data.column_types['features'] = np.delete( + data.supplementary_data.column_types['features'], + list(self.string_columns_transformation_failed.keys()) + ) def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): # Check if columns number correct @@ -302,9 +306,8 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Columns need to be transformed into categorical (string) ones self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) - for column_id in cat_col_ids: - # Update information about column types (in-place) - features_types[column_id] = TYPE_TO_ID[str] + # Update information about column types (in-place) + features_types[cat_col_ids] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -320,16 +323,15 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] - for column_id in self.numerical_into_str: - features_types[column_id] = TYPE_TO_ID[str] + features_types[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ Automatically determine categorical features which should be converted into float """ - str_columns = [ - column_id for column_id, _ in enumerate(data.features.T) - if data.supplementary_data.column_types['features'][column_id] == TYPE_TO_ID[str]] + str_columns = np.flatnonzero( + np.isin(data.supplementary_data.column_types['features'], TYPE_TO_ID[str]) + ) str_cols_df = pd.DataFrame(data.features[:, str_columns], columns=str_columns) orig_nans_cnt = str_cols_df.isna().sum(axis=0) @@ -345,9 +347,10 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): is_numeric_ids = is_numeric[is_numeric].index data.features[:, is_numeric_ids] = converted_str_cols_df[is_numeric_ids].to_numpy() self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) + + # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] - for column_id in is_numeric_ids: - features_types[column_id] = TYPE_TO_ID[float] + features_types[is_numeric_ids] = TYPE_TO_ID[float] # The columns consists mostly of truly str values and has a few ints/floats in it is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & 
(failed_ratio != 1) @@ -366,8 +369,10 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): .difference(self.string_columns_transformation_failed)) str_cols_df = pd.DataFrame(data.features[:, str_cols_ids], columns=str_cols_ids) data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() - for column_id in str_cols_ids: - data.supplementary_data.column_types['features'][column_id] = TYPE_TO_ID[float] + + # Update information about column types (in-place) + features_types = data.supplementary_data.column_types['features'] + features_types[str_cols_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: @@ -480,25 +485,25 @@ def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: current_type=current_type) -def _generate_list_with_types(columns_types_info: pd.DataFrame, converted_columns: Dict[int, Optional[int]]) -> list: +def _generate_list_with_types(columns_types_info: pd.DataFrame, + converted_columns: Dict[int, Optional[int]]) -> np.ndarray: """ Create list with types for all remained columns :param columns_types_info: dictionary with initial column types :param converted_columns: dictionary with transformed column types """ updated_column_types = [] - for column_id, column_info in columns_types_info.items(): - column_types_ids = column_info[_TYPES] - if len(column_types_ids) == 1: + for column_id, column_type_ids in columns_types_info.loc[_TYPES].items(): + if len(column_type_ids) == 1: # Column initially contain only one type - updated_column_types.append(column_types_ids[0]) - elif len(column_types_ids) == 2 and TYPE_TO_ID[type(None)] in column_types_ids: + updated_column_types.append(column_type_ids[0]) + elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids: # Column with one type and nans - filtered_types = [x for x in column_types_ids if x != TYPE_TO_ID[type(None)]] + filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]] updated_column_types.append(filtered_types[0]) else: - if TYPE_TO_ID[str] in column_types_ids: + if TYPE_TO_ID[str] in column_type_ids: # Mixed-types column with string new_col_id = converted_columns[column_id] if new_col_id is not None: @@ -507,7 +512,7 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame, converted_column # Mixed-types with float and integer updated_column_types.append(TYPE_TO_ID[float]) - return updated_column_types + return np.array(updated_column_types) def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type): From 1c044eeeffbe27ae4921a1f38e443f4eeb44cfcb Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 15 May 2023 17:32:39 +0300 Subject: [PATCH 35/72] numpy arr extend fix --- .../data_operations/sklearn_transformations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 530eb84320..bb5d2e95d4 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -197,8 +197,8 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): if cols_number_added > 0: # There are new columns in the table col_types = 
output_data.supplementary_data.column_types['features'] - col_types += [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = col_types + new_types = [TYPE_TO_ID[float]] * cols_number_added + output_data.supplementary_data.column_types['features'] = np.append(col_types, new_types) class ScalingImplementation(EncodedInvariantImplementation): From 71560dff597931495fb162e2752f8588c1c60dea Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 15 May 2023 18:22:02 +0300 Subject: [PATCH 36/72] data_types.py cleanup --- fedot/preprocessing/data_types.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 34cd4cecc9..f354f39ab0 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,7 +1,7 @@ from __future__ import annotations from copy import copy -from typing import TYPE_CHECKING, Tuple, Optional, List, Dict +from typing import TYPE_CHECKING, Tuple, Optional, List, Dict, Sequence import numpy as np import pandas as pd @@ -199,14 +199,9 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) - # data.supplementary_data.column_types['features'] = [ - # col_type_id - # for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) - # if col_id not in self.string_columns_transformation_failed - # ] data.supplementary_data.column_types['features'] = np.delete( data.supplementary_data.column_types['features'], - list(self.string_columns_transformation_failed.keys()) + list(self.string_columns_transformation_failed) ) def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): @@ -422,7 +417,7 @@ def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> return frame.loc[:, _cols_have_any] -def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter): +def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: LoggerAdapter): """ Apply transformation for columns in dataset into desired type. Perform transformation on predict stage when column types were already determined @@ -435,8 +430,7 @@ def type_by_id(current_type_id: int): return int elif current_type_id == TYPE_TO_ID[str]: return str - else: - return float + return float if table is None: # Occurs if for predict stage there is no target info @@ -470,12 +464,12 @@ def _obtain_new_column_type(column_info: pd.Series): def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray, - column_id: int, current_type, log: LoggerAdapter): + column_id: int, current_type: type, log: LoggerAdapter): try: table[:, column_id] = current_column.astype(current_type) if current_type is str: - is_any_comma = any(map(lambda el: ',' in el, current_column)) - is_any_dot = any(map(lambda el: '.' in el, current_column)) + is_any_comma = any(',' in el for el in current_column) + is_any_dot = any('.' in el for el in current_column) # Most likely case: '20,000' must be converted into '20.000' if is_any_comma and is_any_dot: warning = f'Column {column_id} contains both "." and ",". Standardize it.' 
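The warning above fires for columns that mix '.' and ',' decimal separators. A minimal standalone sketch of that standardization idea follows; the helper name and the pandas-based implementation are illustrative only, not part of this patch:

import numpy as np
import pandas as pd


def standardize_decimal_separators(column: np.ndarray) -> np.ndarray:
    # Replace ',' with '.' in string values so that '20,000' parses as 20.0 instead of failing
    cleaned = pd.Series(column).map(lambda el: el.replace(',', '.') if isinstance(el, str) else el)
    # Values that still cannot be parsed become np.nan
    return pd.to_numeric(cleaned, errors='coerce').to_numpy()


print(standardize_decimal_separators(np.array(['20,5', '13.1', 'oops'], dtype=object)))
# [20.5 13.1  nan]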
@@ -515,21 +509,20 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame,
     return np.array(updated_column_types)

-def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type):
+def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type: type):
     """ Process column values one by one and try to convert them into the desired type.
     If not successful replace with np.nan """

     def _process_str_numbers_with_dots_and_commas(value: str):
         """ Try to process str with replacing ',' by '.' in case it was meant to be a number """
         value = value.replace(',', '.')
+        new_value = np.nan
         try:
             # Since "10.6" cannot be converted to 10 straightforwardly using int()
             if current_type is int:
                 new_value = int(float(value))
             else:
                 new_value = current_type(value)
         except ValueError:
-            return np.nan
+            pass
         return new_value

     new_column = []

From 9e89a927c0f5c7be9160e6360052a7a25c4196d0 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Mon, 15 May 2023 18:36:25 +0300
Subject: [PATCH 37/72] lint fixes

---
 .../models/discriminant_analysis.py            |  1 -
 .../repository/operation_types_repository.py   |  2 +-
 fedot/preprocessing/base_preprocessing.py      | 24 +++++++++----------
 fedot/preprocessing/data_types.py              | 14 +++++------
 fedot/preprocessing/dummy_preprocessing.py     | 24 +++++++++----------
 test/unit/data/test_supplementary_data.py      |  2 ++
 6 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py
index 317e3d41a4..a0d7bf2a86 100644
--- a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py
+++ b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py
@@ -1,7 +1,6 @@
 from typing import Optional

 import numpy as np
-import pandas as pd
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

 from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ModelImplementation
diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py
index d300f10f15..7e42d95e60 100644
--- a/fedot/core/repository/operation_types_repository.py
+++ b/fedot/core/repository/operation_types_repository.py
@@ -224,7 +224,7 @@ def get_strategies_by_metadata(metadata: dict) -> Union['EvaluationStrategy', Di
     Args:
         metadata: information about meta of the operation
-
+
     Returns:
         available strategies for current metadata
diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py
index a3244559c7..ae0ef29140 100644
--- a/fedot/preprocessing/base_preprocessing.py
+++ b/fedot/preprocessing/base_preprocessing.py
@@ -54,8 +54,8 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) ->
         raise AbstractMethodNotImplementError

     @abstractmethod
-    def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) -> Union[InputData,
-                                                                                              MultiModalData]:
+    def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]
+                                       ) -> Union[InputData, MultiModalData]:
         """
         Performs obligatory preprocessing for pipeline's predict method.
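For reference, the per-value fallback used by _process_str_numbers_with_dots_and_commas above, extracted into a self-contained sketch; coerce_value is a made-up name for illustration, and keeping the else branch (as in the function's final form) is assumed:

import numpy as np


def coerce_value(value: str, target_type: type):
    # Comma-decimal strings such as '10,6' are retried as numbers
    value = value.replace(',', '.')
    new_value = np.nan
    try:
        if target_type is int:
            # int('10.6') raises ValueError, so go through float first
            new_value = int(float(value))
        else:
            new_value = target_type(value)
    except ValueError:
        pass  # keep np.nan for values that cannot be converted
    return new_value


print(coerce_value('10,6', int))   # 10
print(coerce_value('abc', float))  # nan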
@@ -68,8 +68,8 @@ def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) raise AbstractMethodNotImplementError @abstractmethod - def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[InputData, - MultiModalData]: + def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Launches preprocessing operations if it is necessary for pipeline fitting. @@ -83,8 +83,8 @@ def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalDa raise AbstractMethodNotImplementError @abstractmethod - def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[InputData, - MultiModalData]: + def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Launches preprocessing operations if it is necessary for pipeline predict stage. Preprocessor must be already fitted. @@ -135,8 +135,8 @@ def apply_inverse_target_encoding(self, column_to_transform: np.ndarray) -> np.n raise AbstractMethodNotImplementError @abstractmethod - def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData]) -> \ - Union[InputData, MultiModalData]: + def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Converts provided data's and pipeline's indexes for fit @@ -150,8 +150,8 @@ def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, M raise AbstractMethodNotImplementError @abstractmethod - def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> \ - Union[InputData, MultiModalData]: + def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Converts provided data's and pipeline's indexes for predict @@ -178,8 +178,8 @@ def restore_index(self, input_data: InputData, result: OutputData) -> OutputData raise AbstractMethodNotImplementError @abstractmethod - def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData]) -> Union[InputData, - MultiModalData]: + def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Replaces indices for time series for predict stage diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index f354f39ab0..3feb294ab3 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -354,8 +354,8 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): # If column contains a lot of '?' 
or 'x' as nans equivalents # add it remove list is_of_mistakes = ( - (self.acceptable_failed_rate_bottom <= failed_ratio) - & (failed_ratio < self.acceptable_failed_rate_top)) + (self.acceptable_failed_rate_bottom <= failed_ratio) & + (failed_ratio < self.acceptable_failed_rate_top)) self.string_columns_transformation_failed.update(dict.fromkeys(is_of_mistakes[is_of_mistakes].index)) def _into_numeric_features_transformation_for_predict(self, data: InputData): @@ -392,11 +392,11 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: } types_counts = ( table_of_types - .apply(pd.value_counts, dropna=False) - .reindex(counts_index_mapper.keys(), copy=False) - .replace(np.nan, 0) - .rename(index=counts_index_mapper, copy=False) - .astype(int) + .apply(pd.value_counts, dropna=False) + .reindex(counts_index_mapper.keys(), copy=False) + .replace(np.nan, 0) + .rename(index=counts_index_mapper, copy=False) + .astype(int) ) # Build dataframe with nans indices diff --git a/fedot/preprocessing/dummy_preprocessing.py b/fedot/preprocessing/dummy_preprocessing.py index b088938c6c..36b76a390c 100644 --- a/fedot/preprocessing/dummy_preprocessing.py +++ b/fedot/preprocessing/dummy_preprocessing.py @@ -25,18 +25,18 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) -> BasePreprocessor.mark_as_preprocessed(data) return data - def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: BasePreprocessor.mark_as_preprocessed(data) return data - def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: BasePreprocessor.mark_as_preprocessed(data, is_obligatory=False) return data - def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: BasePreprocessor.mark_as_preprocessed(data, is_obligatory=False) return data @@ -49,17 +49,17 @@ def cut_dataset(self, data: InputData, border: int): def apply_inverse_target_encoding(self, column_to_transform: np.ndarray) -> np.ndarray: return column_to_transform - def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: return data - def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: return data def restore_index(self, input_data: InputData, result: OutputData) -> OutputData: return result - def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: return test_data diff --git a/test/unit/data/test_supplementary_data.py 
b/test/unit/data/test_supplementary_data.py index 5d5581139e..152d62636d 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -12,6 +12,8 @@ from fedot.preprocessing.data_types import TYPE_TO_ID from test.unit.tasks.test_regression import get_synthetic_regression_data +from test.unit.data.test_data_merge import unequal_outputs_table # noqa, fixture + @pytest.fixture() def outputs_table_with_different_types(): From b91a993441b8905d679d028264042f9ca960f6d1 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 15 May 2023 18:39:18 +0300 Subject: [PATCH 38/72] lint fixes (v2) --- test/unit/data/test_supplementary_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index 152d62636d..0318f2e19d 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -47,7 +47,7 @@ def generate_straight_pipeline(): return pipeline -def test_parent_mask_correct(unequal_outputs_table): +def test_parent_mask_correct(unequal_outputs_table): # noqa, fixture """ Test correctness of function for tables mask generation """ correct_parent_mask = {'input_ids': [0, 1], 'flow_lens': [1, 0]} From 74fe8b8f9712f736af65830ea8a828cb66442a0a Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 16 May 2023 18:35:31 +0300 Subject: [PATCH 39/72] supp_data typing upd --- fedot/core/data/supplementary_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index 2456617c75..d4238818e8 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional +from typing import Optional, Dict import numpy as np @@ -28,7 +28,7 @@ class SupplementaryData: # Collection with non-int indexes non_int_idx: Optional[list] = None # Dictionary with features and target column types - column_types: Optional[dict] = None + column_types: Optional[Dict[str, np.ndarray]] = None @property def compound_mask(self): From d3534d07c42892c35ff04295b128f1afe157937f Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 16 May 2023 19:00:24 +0300 Subject: [PATCH 40/72] ensure all column_types are of ndarray type --- .../data/merge/supplementary_data_merger.py | 11 ++++---- .../data_operations/categorical_encoders.py | 15 ++++------- .../data_operations/sklearn_selectors.py | 18 ++++++------- .../sklearn_transformations.py | 6 ++--- .../data_operations/ts_transformations.py | 11 ++++---- fedot/core/operations/model.py | 22 ++++++++++------ fedot/preprocessing/categorical.py | 5 ++-- fedot/preprocessing/data_types.py | 23 ++++++++-------- fedot/preprocessing/preprocessing.py | 2 +- test/unit/data/test_supplementary_data.py | 11 ++++---- .../test_data_operations_implementations.py | 26 ++++++++++--------- .../test_preprocessing_through_api.py | 2 +- test/unit/preprocessing/test_preprocessors.py | 12 ++++----- 13 files changed, 81 insertions(+), 83 deletions(-) diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 1d1c200cae..96cd509460 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -1,5 +1,6 @@ from typing import List, Dict +import numpy as np from golem.core.log import default_log from fedot.core.data.data 
import OutputData @@ -83,7 +84,7 @@ def merge_column_types(self) -> Dict: # Concatenate types for features columns and # choose target type of the main target as the new target type - new_features_types = [] + new_feature_types = [] new_target_types = None for output in self.outputs: if output.supplementary_data.column_types is None: @@ -92,12 +93,10 @@ def merge_column_types(self) -> Dict: output.supplementary_data.column_types = table_corr.prepare_column_types_info(output.predict, output.target, output.task) - col_types = output.supplementary_data.column_types['features'] - new_features_types.extend(col_types) + feature_types = output.supplementary_data.column_types['features'] + new_feature_types.extend(feature_types) if output.supplementary_data.is_main_target: # Target can be None for predict stage new_target_types = output.supplementary_data.column_types.get('target') - - column_types = {'features': new_features_types, 'target': new_target_types} - return column_types + return {'features': np.array(new_feature_types), 'target': new_target_types} diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 505d2610db..7d9894cc5a 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -74,12 +74,12 @@ def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. Categorical columns becomes integer with extension """ if self.categorical_ids: # There are categorical features in the table - col_types = output_data.supplementary_data.column_types['features'] - numerical_columns = [t_name for t_name in col_types if t_name != TYPE_TO_ID[str]] + feature_types = output_data.supplementary_data.column_types['features'] + numerical_columns = feature_types[np.isin(feature_types, TYPE_TO_ID[str], invert=True)] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) - numerical_columns += [TYPE_TO_ID[int]] * encoded_columns_number + numerical_columns = np.append(numerical_columns, [TYPE_TO_ID[int]] * encoded_columns_number) output_data.encoded_idx = self.encoded_ids output_data.supplementary_data.column_types['features'] = numerical_columns @@ -146,13 +146,8 @@ def transform(self, input_data: InputData) -> OutputData: def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. 
Categorical becomes integer """ - if self.categorical_ids: - # Categorical features were in the dataset - col_types = output_data.supplementary_data.column_types['features'] - for categorical_id in self.categorical_ids: - col_types[categorical_id] = TYPE_TO_ID[int] - - output_data.supplementary_data.column_types['features'] = col_types + feature_types = output_data.supplementary_data.column_types['features'] + feature_types[self.categorical_ids] = TYPE_TO_ID[int] def _fit_label_encoders(self, input_data: InputData): """ Fit LabelEncoder for every categorical column in the dataset """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index aa052d7a51..23c56329e1 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -77,16 +77,14 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): """ Update column types after applying feature selection operations """ if len(source_features_shape) < 2: return output_data - else: - if self.features_columns_number > 1: - cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] - if cols_number_removed > 0: - # There are several columns, which were dropped - col_types = output_data.supplementary_data.column_types['features'] - - # Calculate - remained_column_types = np.array(col_types)[self.remain_features_mask] - output_data.supplementary_data.column_types['features'] = list(remained_column_types) + if self.features_columns_number > 1: + cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] + if cols_number_removed: + # There are several columns, which were dropped + feature_types = output_data.supplementary_data.column_types['features'] + + # Calculate + output_data.supplementary_data.column_types['features'] = feature_types[self.remain_features_mask] def _make_new_table(self, features): """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index bb5d2e95d4..f94103f805 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -89,7 +89,7 @@ def update_column_types(output_data: OutputData) -> OutputData: """ _, n_cols = output_data.predict.shape - output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float]] * n_cols + output_data.supplementary_data.column_types['features'] = np.array([TYPE_TO_ID[float]] * n_cols) return output_data @@ -196,9 +196,9 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_added = output_data.predict.shape[1] - source_features_shape[1] if cols_number_added > 0: # There are new columns in the table - col_types = output_data.supplementary_data.column_types['features'] + feature_types = output_data.supplementary_data.column_types['features'] new_types = [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = np.append(col_types, new_types) + output_data.supplementary_data.column_types['features'] = 
np.append(feature_types, new_types) class ScalingImplementation(EncodedInvariantImplementation): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 74dd395fdb..67ce85ff3d 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -127,13 +127,14 @@ def _update_column_types(self, output_data: OutputData): """Update column types after lagged transformation. All features becomes ``float`` """ - features_n_rows, features_n_cols = output_data.predict.shape - features_column_types = [TYPE_TO_ID[float]] * features_n_cols - column_types = {'features': features_column_types} + _, features_n_cols = output_data.predict.shape + feature_types = np.array([TYPE_TO_ID[float]] * features_n_cols) + column_types = {'features': feature_types} if output_data.target is not None and len(output_data.target.shape) > 1: - target_n_rows, target_n_cols = output_data.target.shape - column_types.update({'target': [TYPE_TO_ID[float]] * target_n_cols}) + _, target_n_cols = output_data.target.shape + target_types = np.array([TYPE_TO_ID[float]] * target_n_cols) + column_types['target'] = target_types output_data.supplementary_data.column_types = column_types def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py index 9e38f5e27c..250016fc74 100644 --- a/fedot/core/operations/model.py +++ b/fedot/core/operations/model.py @@ -35,33 +35,39 @@ def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> Ou # Add information about features if is_regression_task or is_ts_forecasting_task: if len(predict_shape) < 2: - column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} + column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} else: - column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} + column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} else: if len(predict_shape) < 2: output_data.predict = output_data.predict.reshape((-1, 1)) predict_shape = output_data.predict.shape # Classification task or clustering target_type = int if output_mode == 'labels' else float - column_info = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} + column_types = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} + + # Make feature types static to suit supplementary data contract + column_types['features'] = np.array(column_types['features']) # Add information about target target_shape = output_data.target.shape if output_data.target is not None else None if target_shape is None: # There is no target column in output data - output_data.supplementary_data.column_types = column_info + output_data.supplementary_data.column_types = column_types return output_data if is_regression_task or is_ts_forecasting_task: if len(target_shape) > 1: - column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]}) + column_types['target'] = [TYPE_TO_ID[float]] * target_shape[1] else: # Array present "time series" - column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)}) + column_types['target'] = [TYPE_TO_ID[float]] * len(output_data.target) else: # Classification task or clustering - column_info.update({'target': 
[TYPE_TO_ID[int]] * predict_shape[1]}) + column_types['target'] = [TYPE_TO_ID[int]] * predict_shape[1] + + # Make target types static to suit supplementary data contract + column_types['target'] = np.array(column_types['target']) - output_data.supplementary_data.column_types = column_info + output_data.supplementary_data.column_types = column_types return output_data diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index fb2e59ee9e..10e7abb339 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -63,9 +63,8 @@ def transform(self, input_data: InputData) -> InputData: self._apply_encoder(copied_data.features) # Update features types - features_types = copied_data.supplementary_data.column_types['features'] - for converted_column_id in self.binary_ids_to_convert: - features_types[converted_column_id] = TYPE_TO_ID[int] + feature_types = copied_data.supplementary_data.column_types['features'] + feature_types[self.binary_ids_to_convert] = TYPE_TO_ID[int] return copied_data def fit_transform(self, input_data: InputData) -> InputData: diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 3feb294ab3..95d1abb6db 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,6 +1,5 @@ from __future__ import annotations -from copy import copy from typing import TYPE_CHECKING, Tuple, Optional, List, Dict, Sequence import numpy as np @@ -93,8 +92,8 @@ def convert_data_for_fit(self, data: InputData): # Launch conversion float and integer features into categorical self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types - self.features_types = copy(data.supplementary_data.column_types['features']) - self.target_types = copy(data.supplementary_data.column_types['target']) + self.features_types = data.supplementary_data.column_types['features'].copy() + self.target_types = data.supplementary_data.column_types['target'].copy() self._retain_columns_info_without_types_conflicts(data) return data @@ -289,8 +288,8 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - features_types = data.supplementary_data.column_types['features'] - is_numeric_type = np.isin(features_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) + feature_types = data.supplementary_data.column_types['features'] + is_numeric_type = np.isin(feature_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) nuniques = num_df.nunique(dropna=True) @@ -302,7 +301,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): # Columns need to be transformed into categorical (string) ones self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) # Update information about column types (in-place) - features_types[cat_col_ids] = TYPE_TO_ID[str] + feature_types[cat_col_ids] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -317,8 +316,8 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[self.numerical_into_str] = TYPE_TO_ID[str] + feature_types = data.supplementary_data.column_types['features'] + feature_types[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ @@ -344,8 +343,8 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[is_numeric_ids] = TYPE_TO_ID[float] + feature_types = data.supplementary_data.column_types['features'] + feature_types[is_numeric_ids] = TYPE_TO_ID[float] # The columns consists mostly of truly str values and has a few ints/floats in it is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1) @@ -366,8 +365,8 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[str_cols_ids] = TYPE_TO_ID[float] + feature_types = data.supplementary_data.column_types['features'] + feature_types[str_cols_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 5186a1b84c..1c2242e744 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -428,7 +428,7 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarra encoded_target = data.target if encoder is not None: # Target encoders have already been fitted - data.supplementary_data.column_types['target'] = [TYPE_TO_ID[int]] + data.supplementary_data.column_types['target'] = np.array([TYPE_TO_ID[int]]) encoded_target = encoder.transform(encoded_target) if len(encoded_target.shape) == 1: encoded_target = 
encoded_target.reshape((-1, 1)) diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index 0318f2e19d..b1447a1b85 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -10,9 +10,8 @@ from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.preprocessing.data_types import TYPE_TO_ID -from test.unit.tasks.test_regression import get_synthetic_regression_data - from test.unit.data.test_data_merge import unequal_outputs_table # noqa, fixture +from test.unit.tasks.test_regression import get_synthetic_regression_data @pytest.fixture() @@ -21,15 +20,15 @@ def outputs_table_with_different_types(): task = Task(TaskTypesEnum.regression) idx = [0, 1, 2] target = [1, 2, 10] - data_info_first = SupplementaryData(column_types={'features': [TYPE_TO_ID[str], TYPE_TO_ID[float]], - 'target': [TYPE_TO_ID[int]]}) + data_info_first = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str], TYPE_TO_ID[float]]), + 'target': np.array([TYPE_TO_ID[int]])}) output_first = OutputData(idx=idx, features=None, predict=np.array([['a', 1.1], ['b', 2], ['c', 3]], dtype=object), task=task, target=target, data_type=DataTypesEnum.table, supplementary_data=data_info_first) - data_info_second = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]], - 'target': [TYPE_TO_ID[int]]}) + data_info_second = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]]), + 'target': np.array([TYPE_TO_ID[int]])}) output_second = OutputData(idx=idx, features=None, predict=np.array([[2.5], [2.1], [9.3]], dtype=float), task=task, target=target, data_type=DataTypesEnum.table, diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index 8910f154c0..fac04125ae 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -129,7 +129,7 @@ def get_multivariate_time_series(mutli_ts=False): def get_nan_inf_data(): - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]] * 4}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]] * 4)}) train_input = InputData(idx=[0, 1, 2, 3], features=np.array([[1, 2, 3, 4], [2, np.nan, 4, 5], @@ -144,8 +144,8 @@ def get_nan_inf_data(): def get_single_feature_data(task=None): - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int]], - 'target': [TYPE_TO_ID[int]]}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int]]), + 'target': np.array([TYPE_TO_ID[int]])}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=np.array([[1], [2], [3], [7], [8], [9]]), target=np.array([[0], [0], [0], [1], [1], [1]]), @@ -168,10 +168,11 @@ def get_mixed_data(task=None, extended=False): [np.nan, np.nan, '1', np.nan, '2', 'not blue', 'di'], [8, '1', '1', 0, '1', 'not blue', 'da bu'], [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object) - features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], - TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]] + features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], + TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) + target_types = np.array([TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': features_types, 
- 'target': [TYPE_TO_ID[int]]}) + 'target': target_types}) else: features = np.array([[1, '0', 1], [2, '1', 0], @@ -179,9 +180,10 @@ def get_mixed_data(task=None, extended=False): [7, '1', 1], [8, '1', 1], [9, '0', 0]], dtype=object) - features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]] + features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) + target_types = np.array([TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': features_types, - 'target': [TYPE_TO_ID[int]]}) + 'target': target_types}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, @@ -200,7 +202,7 @@ def get_nan_binary_data(task=None): Binary int columns must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]] + features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': features_types}) features = np.array([[1, '0', 0], [np.nan, np.nan, np.nan], @@ -231,8 +233,8 @@ def get_unbalanced_dataset(size=10, disbalance=0.4, target_dim=None): target = target.reshape(-1, 1) supp_data = SupplementaryData(column_types={ - 'features': [TYPE_TO_ID[int], TYPE_TO_ID[str]], - 'target': [TYPE_TO_ID[int]] + 'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[str]]), + 'target': np.array([TYPE_TO_ID[int]]) }) input_data = InputData(idx=np.arange(features.shape[0]), @@ -251,7 +253,7 @@ def data_with_binary_int_features_and_equal_categories(): must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int], TYPE_TO_ID[int]]}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[int]])}) task = Task(TaskTypesEnum.classification) features = np.array([[1, 10], [np.nan, np.nan], diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py index 3b0e60fc25..5726b041df 100644 --- a/test/unit/preprocessing/test_preprocessing_through_api.py +++ b/test/unit/preprocessing/test_preprocessing_through_api.py @@ -11,7 +11,7 @@ def data_with_only_categorical_features(): """ Generate tabular data with only categorical features. All of them are binary. 
""" - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[str]] * 3}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str]] * 3)}) task = Task(TaskTypesEnum.regression) features = np.array([["'a'", "0", "1"], ["'b'", "1", "0"], diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 562e821a3e..5cdab850d4 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -129,15 +129,15 @@ def test_column_types_converting_correctly(): types_corr = TableTypesCorrector() data = types_corr.convert_data_for_fit(data) - features_types = data.supplementary_data.column_types['features'] + feature_types = data.supplementary_data.column_types['features'] target_types = data.supplementary_data.column_types['target'] - assert len(features_types) == 4 + assert len(feature_types) == 4 assert len(target_types) == 2 - assert features_types[0] == TYPE_TO_ID[str] - assert features_types[1] == TYPE_TO_ID[str] - assert features_types[2] == TYPE_TO_ID[str] - assert target_types[0] == target_types[1] == TYPE_TO_ID[str] + assert feature_types[0] == TYPE_TO_ID[str] + assert feature_types[1] == TYPE_TO_ID[str] + assert feature_types[2] == TYPE_TO_ID[str] + assert (target_types == TYPE_TO_ID[str]).all() def test_column_types_process_correctly(): From 08e221e911b76fe43d6ab349ff96a8f1a96ae064 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 17 May 2023 14:30:01 +0300 Subject: [PATCH 41/72] column types naming fix --- fedot/core/data/data_preprocessing.py | 2 +- .../data/merge/supplementary_data_merger.py | 2 +- .../data_operations/categorical_encoders.py | 8 ++--- .../sklearn_transformations.py | 4 +-- fedot/preprocessing/categorical.py | 4 +-- fedot/preprocessing/data_types.py | 30 +++++++++---------- fedot/preprocessing/preprocessing.py | 2 +- test/unit/data/test_supplementary_data.py | 10 +++---- .../test_data_operations_implementations.py | 22 +++++++------- test/unit/preprocessing/test_preprocessors.py | 6 ++-- 10 files changed, 45 insertions(+), 45 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index e5565204e6..40e2d23f42 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -54,7 +54,7 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda Args: table: tabular data for string columns types determination. - column_type_ids: list with column types. If None, perform default checking. + column_type_ids: list with column type ids. If None, perform default checking. Returns: categorical_ids: indices of categorical columns in table. non_categorical_ids: indices of non categorical columns in table. 
diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 96cd509460..dc16538e08 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -76,7 +76,7 @@ def prepare_parent_mask(self) -> Dict: features_mask = {'input_ids': input_ids, 'flow_lens': flow_lens} return features_mask - def merge_column_types(self) -> Dict: + def merge_column_types(self) -> Dict[str, np.ndarray]: """ Store information about column types in tabular data for merged data """ if self.main_output.data_type is not DataTypesEnum.table: # Data is not tabular diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 7d9894cc5a..55071ae5f2 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -36,8 +36,8 @@ def fit(self, input_data: InputData): :return encoder: trained encoder (optional output) """ features = input_data.features - column_type_ids = input_data.supplementary_data.column_types.get('features') - self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, column_type_ids) + feature_type_ids = input_data.supplementary_data.column_types['features'] + self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, feature_type_ids) # If there are categorical features - process it if self.categorical_ids: @@ -119,9 +119,9 @@ def __init__(self, params: Optional[OperationParameters] = None): self.non_categorical_ids: List[int] = None def fit(self, input_data: InputData): - column_type_ids = input_data.supplementary_data.column_types.get('features') + feature_type_ids = input_data.supplementary_data.column_types['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - column_type_ids) + feature_type_ids) # If there are categorical features - process it if self.categorical_ids: diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index f94103f805..05d824e320 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -290,9 +290,9 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) if data_type_is_table(input_data) and data_has_categorical_features(input_data): - column_type_ids = input_data.supplementary_data.column_types.get('features') + feature_type_ids = input_data.supplementary_data.column_types['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - column_type_ids) + feature_type_ids) numerical, categorical = divide_data_categorical_numerical(input_data, self.categorical_ids, self.non_categorical_ids) diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 10e7abb339..162611c28c 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -24,9 +24,9 @@ def fit(self, input_data: 
InputData): Find indices of columns which are contains categorical values. Binary features and at the same time has str objects. If there are such features - convert it into int """ - column_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.column_types['features'] categorical_ids, _ = find_categorical_columns(input_data.features, - column_type_ids) + feature_type_ids) binary_ids_to_convert = [] for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 95d1abb6db..45bfca42bc 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -12,9 +12,9 @@ from fedot.core.data.data import InputData _convertable_types = (bool, float, int, str, type(None)) # preserve lexicographical order -_types_ids = range(len(_convertable_types)) +_type_ids = range(len(_convertable_types)) -TYPE_TO_ID = dict(zip(_convertable_types, _types_ids)) +TYPE_TO_ID = dict(zip(_convertable_types, _type_ids)) _TYPES = 'types' _FLOAT_NUMBER = 'float_number' @@ -64,8 +64,8 @@ def __init__(self): self.target_converting_has_errors = False # Lists with column types for converting calculated on source input data - self.features_types = None - self.target_types = None + self.feature_type_ids = None + self.target_type_ids = None self.log = default_log(self) def convert_data_for_fit(self, data: InputData): @@ -78,7 +78,7 @@ def convert_data_for_fit(self, data: InputData): self.target_columns_info = define_column_types(data.target) # Correct types in features table - data.features = self.features_types_converting(features=data.features) + data.features = self.feature_types_converting(features=data.features) # Remain only correct columns data.features = self.remove_incorrect_features(data.features, self.features_converted_columns) @@ -92,8 +92,8 @@ def convert_data_for_fit(self, data: InputData): # Launch conversion float and integer features into categorical self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types - self.features_types = data.supplementary_data.column_types['features'].copy() - self.target_types = data.supplementary_data.column_types['target'].copy() + self.feature_type_ids = data.supplementary_data.column_types['features'].copy() + self.target_type_ids = data.supplementary_data.column_types.get('target', np.array()).copy() self._retain_columns_info_without_types_conflicts(data) return data @@ -103,8 +103,8 @@ def convert_data_for_predict(self, data: InputData): # Ordering is important because after removing incorrect features - indices are obsolete data.features = data.features.astype(object) data.features = self.remove_incorrect_features(data.features, self.features_converted_columns) - data.features = apply_type_transformation(data.features, self.features_types, self.log) - data.target = apply_type_transformation(data.target, self.target_types, self.log) + data.features = apply_type_transformation(data.features, self.feature_type_ids, self.log) + data.target = apply_type_transformation(data.target, self.target_type_ids, self.log) data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) @@ -126,7 +126,7 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): table = np.delete(table, self.columns_to_del, 1) return table - def 
features_types_converting(self, features: np.ndarray) -> np.ndarray: + def feature_types_converting(self, features: np.ndarray) -> np.ndarray: """ Convert all elements in the data in every feature column into one type :param features: tabular features array @@ -173,20 +173,20 @@ def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = if self.features_columns_info.empty: # Information about column types is empty - there is a need to launch algorithm to collect info self.features_columns_info = define_column_types(predictors) - predictors = self.features_types_converting(features=predictors) + predictors = self.feature_types_converting(features=predictors) if self.target_columns_info.empty and task.task_type is not TaskTypesEnum.ts_forecasting: self.target_columns_info = define_column_types(target) target = self.target_types_converting(target=target, task=task) - features_types = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) - self._check_columns_vs_types_number(predictors, features_types) + feature_types = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) + self._check_columns_vs_types_number(predictors, feature_types) if target is None or task.task_type is TaskTypesEnum.ts_forecasting: - return {'features': features_types} + return {'features': feature_types} else: target_types = _generate_list_with_types(self.target_columns_info, self.target_converted_columns) self._check_columns_vs_types_number(target, target_types) - return {'features': features_types, 'target': target_types} + return {'features': feature_types, 'target': target_types} def _retain_columns_info_without_types_conflicts(self, data: InputData): """ Update information in supplementary info - retain info only about remained columns. 
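The type-id arrays being renamed in this hunk all derive from one per-cell pass over the table: every element is mapped to the id of its Python type, with NaN counted as NoneType (this is what define_column_types in this module does). A compact sketch of that pass, again with a local TYPE_TO_ID stand-in rather than the module constant:

import numpy as np
import pandas as pd

TYPE_TO_ID = {bool: 0, float: 1, int: 2, str: 3, type(None): 4}  # local stand-in

def cell_type_ids(table: np.ndarray) -> pd.DataFrame:
    # Map every cell to the id of its Python type; NaN cells count as NoneType
    df = pd.DataFrame(table, copy=True)
    return df.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)])

table = np.array([[1, 'a'], [2.5, np.nan]], dtype=object)
print(cell_type_ids(table))
#    0  1
# 0  2  3
# 1  1  4

The unique ids per column then give the 'types' row that feature_types_converting and target_types_converting consume.
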
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 1c2242e744..3ad213c912 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -402,7 +402,7 @@ def _train_target_encoder(self, data: InputData, source_name: str): data: data to be encoded source_name: name of the data source node """ - categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.column_types['target']) + categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.column_types.get('target')) if categorical_ids: # Target is categorical diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index b1447a1b85..7c768392b5 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -118,11 +118,11 @@ def test_define_types_after_merging(outputs_table_with_different_types): merged_data = DataMerger.get(outputs).merge() updated_info = merged_data.supplementary_data - features_types = updated_info.column_types['features'] - target_types = updated_info.column_types['target'] + feature_type_ids = updated_info.column_types['features'] + target_type_ids = updated_info.column_types['target'] # Target type must stay the same ancestor_target_type = outputs[0].supplementary_data.column_types['target'][0] - assert target_types[0] == ancestor_target_type - assert len(features_types) == 3 - assert tuple(features_types) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float]) + assert target_type_ids[0] == ancestor_target_type + assert len(feature_type_ids) == 3 + assert tuple(feature_type_ids) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float]) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index fac04125ae..9064781164 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -168,11 +168,11 @@ def get_mixed_data(task=None, extended=False): [np.nan, np.nan, '1', np.nan, '2', 'not blue', 'di'], [8, '1', '1', 0, '1', 'not blue', 'da bu'], [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object) - features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], - TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) - target_types = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': features_types, - 'target': target_types}) + feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], + TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) + target_type_ids = np.array([TYPE_TO_ID[int]]) + supp_data = SupplementaryData(column_types={'features': feature_type_ids, + 'target': target_type_ids}) else: features = np.array([[1, '0', 1], [2, '1', 0], @@ -180,10 +180,10 @@ def get_mixed_data(task=None, extended=False): [7, '1', 1], [8, '1', 1], [9, '0', 0]], dtype=object) - features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) - target_types = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': features_types, - 'target': target_types}) + feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) + target_type_ids = np.array([TYPE_TO_ID[int]]) + supp_data = SupplementaryData(column_types={'features': feature_type_ids, + 'target': target_type_ids}) train_input = 
InputData(idx=[0, 1, 2, 3, 4, 5], features=features, @@ -202,8 +202,8 @@ def get_nan_binary_data(task=None): Binary int columns must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': features_types}) + feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) + supp_data = SupplementaryData(column_types={'features': feature_type_ids}) features = np.array([[1, '0', 0], [np.nan, np.nan, np.nan], [0, '2', 1], diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 5cdab850d4..afb64661a3 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -156,10 +156,10 @@ def test_column_types_process_correctly(): pipeline.fit(train_data) predicted = pipeline.predict(test_data) - features_types_ids = predicted.supplementary_data.column_types['features'] - assert len(features_types_ids) == predicted.predict.shape[1] + feature_type_ids = predicted.supplementary_data.column_types['features'] + assert len(feature_type_ids) == predicted.predict.shape[1] # All output values are float - assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_types_ids) + assert (feature_type_ids == TYPE_TO_ID[float]).all() def test_complicated_table_types_processed_correctly(): From f9e47cf471c79688209abd1d1c3ef2584ba794ac Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 17 May 2023 16:42:07 +0300 Subject: [PATCH 42/72] remove unused f-string signs --- fedot/core/data/merge/supplementary_data_merger.py | 2 +- fedot/core/data/supplementary_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index dc16538e08..866f450cc2 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -88,7 +88,7 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: new_target_types = None for output in self.outputs: if output.supplementary_data.column_types is None: - self.log.debug(f'Perform determination of column types in DataMerger') + self.log.debug('Perform determination of column types in DataMerger') table_corr = TableTypesCorrector() output.supplementary_data.column_types = table_corr.prepare_column_types_info(output.predict, output.target, diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index d4238818e8..bd9f8d8881 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -53,7 +53,7 @@ def define_parents(self, unique_features_masks: np.array, task: TaskTypesEnum): :param task: task to solve """ if not isinstance(self.previous_operations, list) or len(self.previous_operations) == 1: - raise ValueError(f'Data was received from one node but at least two nodes are required') + raise ValueError('Data was received from one node but at least two nodes are required') data_operations = OperationTypesRepository('data_operation').suitable_operation(task_type=task) From 415bb0b242cf490a14026a736d5b1ee90278f775 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 5 Jul 2023 19:25:09 +0300 Subject: [PATCH 43/72] minor fixes --- fedot/core/data/data_preprocessing.py | 6 +++--- 
fedot/preprocessing/data_types.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 40e2d23f42..dd710c0f14 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - has_infs = ((features == np.inf) | (features == -np.inf)) - if np.any(has_infs): - features[has_infs] = np.nan + is_inf = (features == np.inf) | (features == -np.inf) + if np.any(is_inf): + features[is_inf] = np.nan def replace_nans_with_empty_strings(input_data: InputData): diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 45bfca42bc..55bbe88441 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -93,7 +93,9 @@ def convert_data_for_fit(self, data: InputData): self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types self.feature_type_ids = data.supplementary_data.column_types['features'].copy() - self.target_type_ids = data.supplementary_data.column_types.get('target', np.array()).copy() + self.target_type_ids = data.supplementary_data.column_types.get( + 'target', np.empty((self.feature_type_ids.shape[0], 1), dtype=float) + ).copy() self._retain_columns_info_without_types_conflicts(data) return data From 3d7043657fc8df3a36ee72ccdbca8c3648d9a0df Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Fri, 7 Jul 2023 20:32:15 +0300 Subject: [PATCH 44/72] minor dct update fix --- fedot/preprocessing/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 162611c28c..449dbbdf8c 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -88,7 +88,7 @@ def _train_encoder(self, column: pd.Series): encoder.fit(column) # Store fitted label encoder for transform method - self.binary_encoders.update({column.name: encoder}) + self.binary_encoders[column.name] = encoder def _apply_encoder(self, data: np.ndarray): """ From 6be9b2c01723e6087f7091242252fccb51f012af Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Fri, 7 Jul 2023 20:32:56 +0300 Subject: [PATCH 45/72] data_types.py further vectorization --- fedot/preprocessing/data_types.py | 223 ++++++++++++------------------ 1 file changed, 92 insertions(+), 131 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 55bbe88441..da0dc272fe 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -135,15 +135,7 @@ def feature_types_converting(self, features: np.ndarray) -> np.ndarray: """ mixed_types_columns = _find_mixed_types_columns(self.features_columns_info) cols_with_strings_or_floats = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER, _FLOAT_NUMBER]) - - def _update_converted_columns_and_data(column_info: pd.Series): - updated_column, new_type_id = self._convert_feature_into_one_type(features[:, column_info.name], - column_info) - self.features_converted_columns[column_info.name] = new_type_id - if updated_column is not None: - features[:, column_info.name] = updated_column - - cols_with_strings_or_floats.apply(_update_converted_columns_and_data) + cols_with_strings_or_floats.apply(self._convert_feature_into_one_type, features=features) 
        return features

@@ -155,15 +147,7 @@ def target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray:
        """
        mixed_types_columns = _find_mixed_types_columns(self.target_columns_info)
        cols_with_strings = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER])
-
-        def _update_converted_columns_and_data(column_info: pd.Series):
-            updated_column, new_type_id = self._convert_target_into_one_type(target[:, column_info.name], column_info,
-                                                                             task)
-            self.target_converted_columns[column_info.name] = new_type_id
-            if updated_column is not None:
-                target[:, column_info.name] = updated_column
-
-        cols_with_strings.apply(_update_converted_columns_and_data)
+        cols_with_strings.apply(self._convert_target_into_one_type, target=target, task=task)

        return target

@@ -225,43 +209,46 @@ def _remove_pseudo_str_values_from_str_column(data: InputData, columns: pd.Index
                    # item is numeric, remove its value
                    data.features[row_id, col_id] = np.nan

-    def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series):
+    def _convert_feature_into_one_type(self, column_info: pd.Series, features: np.ndarray):
        """ Determine new type for current feature column based on the string ratio. And then convert column into it.

-        :param mixed_column: one-dimensional array with several data types
+        :param features: two-dimensional features table holding the column that is converted in place
        :param column_info: dictionary with information about types in the column
        :param mixed_column_id: index of column in dataset
        """
+        new_type_id = None
        if len(column_info[_TYPES]) == 2 and TYPE_TO_ID[type(None)] in column_info[_TYPES]:
            # Column contain only one data type and nans
-            filtered_types = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
-            return mixed_column, filtered_types[0]
-
-        string_objects_number = column_info[_STR_NUMBER]
-        all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
-        string_ratio = string_objects_number / all_elements_number
-
-        if string_ratio > 0:
-            suggested_type = str
+            non_nan_type_lst = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
+            new_type_id = non_nan_type_lst[0]
        else:
-            suggested_type = _obtain_new_column_type(column_info)
+            string_objects_number = column_info[_STR_NUMBER]
+            all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
+            string_ratio = string_objects_number / all_elements_number

-        try:
-            mixed_column = mixed_column.astype(suggested_type)
-            # If there were nans in the column - paste nan
-            if column_info[_NAN_NUMBER]:
-                mixed_column = mixed_column.astype(object)
-                mixed_column[column_info[_NAN_IDS]] = np.nan
-                del column_info[_NAN_IDS]
-            return mixed_column, TYPE_TO_ID[suggested_type]
-        except ValueError:
-            # Cannot convert string objects into int or float (for example 'a' into int)
-            prefix = f'Feature column with index {column_info.name} contains ' \
-                     f'following data types: {column_info[_TYPES]}.'
-            self.log.warning(f'{prefix} String cannot be converted into {suggested_type}.
Drop column.') - return None, None + if string_ratio > 0: + suggested_type = str + else: + suggested_type = _obtain_new_column_type(column_info) + + try: + converted = features[:, column_info.name].astype(suggested_type) + # If there were nans in the column - paste nan + if column_info[_NAN_NUMBER]: + converted = converted.astype(object) + converted[column_info[_NAN_IDS]] = np.nan + del column_info[_NAN_IDS] + features[:, column_info.name] = converted + except ValueError: + # Cannot convert string objects into int or float (for example 'a' into int) + prefix = (f'Feature column with index {column_info.name} contains ' + f'the following data types: {column_info[_TYPES]}.') + self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.') + else: + new_type_id = TYPE_TO_ID[suggested_type] + self.features_converted_columns[column_info.name] = new_type_id - def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series, + def _convert_target_into_one_type(self, column_info: pd.Series, target: np.ndarray, task: Task) -> Tuple[np.ndarray, str]: """ Convert target columns into one type based on column proportions of object and task """ if task.task_type is TaskTypesEnum.classification: @@ -269,21 +256,21 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: p suggested_type = str else: suggested_type = _obtain_new_column_type(column_info) + self.target_converted_columns[column_info.name] = TYPE_TO_ID[suggested_type] + mixed_column = target[:, column_info.name] try: - mixed_column = mixed_column.astype(suggested_type) - return mixed_column, TYPE_TO_ID[suggested_type] + target[:, column_info.name] = mixed_column.astype(suggested_type) except ValueError: # Cannot convert string objects into int or float (for example 'a' into int) - target_column = pd.Series(mixed_column) - converted_column = pd.to_numeric(target_column, errors='coerce') + converted_column = pd.to_numeric(mixed_column, errors='coerce') prefix = (f'Target column with index {column_info.name} contains ' - f'following data types: {column_info[_TYPES]}.') - log_message = f'{prefix} String cannot be converted into {suggested_type}. Ignore non converted values.' + f'the following data types: {column_info[_TYPES]}.') + log_message = f'{prefix} String cannot be converted into {suggested_type}. Set unconverted values to NaN.' 
        self.log.debug(log_message)
        self.target_converting_has_errors = True
-        return converted_column.values, TYPE_TO_ID[suggested_type]
+        target[:, column_info.name] = converted_column

    def _into_categorical_features_transformation_for_fit(self, data: InputData):
        """
@@ -295,6 +282,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
        numeric_type_ids = np.flatnonzero(is_numeric_type)
        num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids)
        nuniques = num_df.nunique(dropna=True)
+        # Reduce dataframe to include only categorical features
        num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)]
        cat_col_ids = num_df.columns
@@ -307,10 +295,6 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData):
        """ Apply conversion into categorical string column for every signed column """
-        if not self.numerical_into_str:
-            # There is no transformation for current table
-            return data
-
        # Get numerical columns
        num_df = pd.DataFrame(data.features[:, self.numerical_into_str],
                              columns=self.numerical_into_str)

@@ -325,10 +309,9 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
        """
        Automatically determine categorical features which should be converted into float
        """
-        str_columns = np.flatnonzero(
-            np.isin(data.supplementary_data.column_types['features'], TYPE_TO_ID[str])
-        )
-        str_cols_df = pd.DataFrame(data.features[:, str_columns], columns=str_columns)
+        is_str_type = data.supplementary_data.column_types['features'] == TYPE_TO_ID[str]
+        str_col_ids = np.flatnonzero(is_str_type)
+        str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids)
        orig_nans_cnt = str_cols_df.isna().sum(axis=0)

        converted_str_cols_df = str_cols_df.apply(pd.to_numeric, errors='coerce')
@@ -348,12 +331,12 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
        feature_types = data.supplementary_data.column_types['features']
        feature_types[is_numeric_ids] = TYPE_TO_ID[float]

-        # The columns consists mostly of truly str values and has a few ints/floats in it
+        # The columns consist mostly of truly str values and have a few ints/floats in it
        is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1)
        self._remove_pseudo_str_values_from_str_column(data, is_mixed[is_mixed].index)

        # If column contains a lot of '?'
or 'x' as nans equivalents - # add it remove list + # add it to remove list is_of_mistakes = ( (self.acceptable_failed_rate_bottom <= failed_ratio) & (failed_ratio < self.acceptable_failed_rate_top)) @@ -361,28 +344,27 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): def _into_numeric_features_transformation_for_predict(self, data: InputData): """ Apply conversion into float string column for every signed column """ - str_cols_ids = list(set(self.categorical_into_float) - .difference(self.string_columns_transformation_failed)) - str_cols_df = pd.DataFrame(data.features[:, str_cols_ids], columns=str_cols_ids) - data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() + str_col_ids = np.setdiff1d( + self.categorical_into_float, + list(self.string_columns_transformation_failed) + ).astype(int) + str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids) + data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) feature_types = data.supplementary_data.column_types['features'] - feature_types[str_cols_ids] = TYPE_TO_ID[float] + feature_types[str_col_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: """ Prepare information about types per columns. For each column store unique types, which column contains. """ - if table is None: - return pd.DataFrame() - table_of_types = pd.DataFrame(table, copy=True) table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8) # Build dataframe with unique types for each column - uniques = table_of_types.apply([pd.unique]).rename(index={'unique': _TYPES}) + uniques = table_of_types.apply(pd.unique, result_type='reduce').to_frame(_TYPES).T # Build dataframe with amount of each type counts_index_mapper = { @@ -394,14 +376,18 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: types_counts = ( table_of_types .apply(pd.value_counts, dropna=False) - .reindex(counts_index_mapper.keys(), copy=False) + .reindex(counts_index_mapper.keys(), copy=False) # Sets all type ids .replace(np.nan, 0) - .rename(index=counts_index_mapper, copy=False) + .rename(index=counts_index_mapper, copy=False) # Renames all type ids to strs .astype(int) ) # Build dataframe with nans indices - nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: _NAN_IDS}) + nans_ids = ( + (table_of_types == TYPE_TO_ID[type(None)]) + .apply(np.flatnonzero, result_type='reduce') + .to_frame(_NAN_IDS).T + ) # Combine all dataframes return pd.concat([uniques, types_counts, nans_ids]) @@ -409,13 +395,13 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: def _find_mixed_types_columns(columns_info: pd.DataFrame) -> pd.DataFrame: """ Search for columns with several types in them """ - has_mixed_types = [] if columns_info.empty else columns_info.loc[_TYPES].apply(len) > 1 + has_mixed_types = columns_info.loc[_TYPES].apply(len) > 1 return columns_info.loc[:, has_mixed_types] def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> pd.DataFrame: - _cols_have_any = [] if frame.empty else frame.loc[rows_to_select].any() - return frame.loc[:, _cols_have_any] + cols_have_any = frame.loc[rows_to_select].any() + return frame.loc[:, cols_have_any] def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: LoggerAdapter): @@ -424,27 +410,13 @@ 
def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: Lo
    transformation on predict stage when column types were already determined during fit
    """
+    table_df = pd.DataFrame(table, copy=False)
+    types_sr = pd.Series(column_types).map({
+        **{TYPE_TO_ID[t]: t for t in [int, str]},
+        **{TYPE_TO_ID[t]: float for t in [bool, type(None), float]}
+    })

-    def type_by_id(current_type_id: int):
-        """ Return type by its ID """
-        if current_type_id == TYPE_TO_ID[int]:
-            return int
-        elif current_type_id == TYPE_TO_ID[str]:
-            return str
-        return float
-
-    if table is None:
-        # Occurs if for predict stage there is no target info
-        return None
-
-    _, n_cols = table.shape
-    for column_id in range(n_cols):
-        current_column = table[:, column_id]
-        current_type = type_by_id(column_types[column_id])
-        _convert_predict_column_into_desired_type(table=table, current_column=current_column, current_type=current_type,
-                                                  column_id=column_id, log=log)
-
-    return table
+    return table_df.apply(_convert_predict_column_into_desired_type, types_sr=types_sr, log=log).to_numpy()

def convert_num_column_into_string_array(numerical_column: pd.Series) -> pd.Series:
@@ -464,20 +436,19 @@ def _obtain_new_column_type(column_info: pd.Series):
    return int

-def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray,
-                                              column_id: int, current_type: type, log: LoggerAdapter):
+def _convert_predict_column_into_desired_type(current_column: pd.Series, types_sr: pd.Series, log: LoggerAdapter):
+    current_type = types_sr.loc[current_column.name]
    try:
-        table[:, column_id] = current_column.astype(current_type)
+        converted_column = current_column.astype(current_type)
        if current_type is str:
-            is_any_comma = any(',' in el for el in current_column)
-            is_any_dot = any('.' in el for el in current_column)
-            # Most likely case: '20,000' must be converted into '20.000'
-            if is_any_comma and is_any_dot:
-                warning = f'Column {column_id} contains both "." and ",". Standardize it.'
+            has_comma_and_dot = converted_column.str.contains(',', regex=False).any() and converted_column.str.contains('.', regex=False).any()
+            if has_comma_and_dot:
+                # Most likely case: '20,000' must be converted into '20.000'
+                warning = f'Column {current_column.name} contains both "." and ",". Standardize it.'
                log.warning(warning)
    except ValueError:
-        table[:, column_id] = _process_predict_column_values_one_by_one(current_column=current_column,
-                                                                        current_type=current_type)
+        converted_column = current_column.apply(_process_predict_column_values_one_by_one, current_type=current_type)
+    return converted_column

def _generate_list_with_types(columns_types_info: pd.DataFrame,
@@ -510,29 +481,19 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame,
    return np.array(updated_column_types)

-def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type: type):
+def _process_predict_column_values_one_by_one(value, current_type: type):
    """ Process column values one by one and try to convert them into desirable type.
    If not successful replace with np.nan """
-
-    def _process_str_numbers_with_dots_and_commas(value: str):
-        """ Try to process str with replacing ',' by '.'
in case it was meant to be a number """ - value = value.replace(',', '.') - new_value = np.nan - try: - # Since "10.6" can not be converted to 10 straightforward using int() - if current_type is int: - new_value = int(float(value)) - except ValueError: - pass - return new_value - - new_column = [] - for value in current_column: - new_value = np.nan - try: - new_value = current_type(value) - except ValueError: - if isinstance(value, str) and ('.' in value or ',' in value): - new_value = _process_str_numbers_with_dots_and_commas(value=value) - new_column.append(new_value) - return new_column + new_value = np.nan + try: + new_value = current_type(value) + except ValueError: + if isinstance(value, str) and ('.' in value or ',' in value): + value = value.replace(',', '.') + try: + # Since "10.6" can not be converted to 10 straightforward using int() + if current_type is int: + new_value = int(float(value)) + except ValueError: + pass + return new_value From d39e623a2c1db74310e4bd73a99c6d6939c749b9 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 10 Jul 2023 16:40:00 +0300 Subject: [PATCH 46/72] preprocessing simplifications and logical fixes --- fedot/core/data/data_preprocessing.py | 2 +- .../data_operations/categorical_encoders.py | 33 ++++++++----------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index dd710c0f14..26bc1ea462 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -63,7 +63,7 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda # Define if data contains string columns for "unknown table" return force_categorical_determination(table) - is_str = np.isin(column_type_ids, TYPE_TO_ID[str]) + is_str = column_type_ids == TYPE_TO_ID[str] categorical_ids = np.flatnonzero(is_str).tolist() non_categorical_ids = np.flatnonzero(~is_str).tolist() diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 55071ae5f2..34e08f5a45 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -1,8 +1,7 @@ from copy import deepcopy -from typing import Optional, List +from typing import List, Optional import numpy as np -import pandas as pd from sklearn.preprocessing import LabelEncoder, OneHotEncoder @@ -75,7 +74,7 @@ def _update_column_types(self, output_data: OutputData): if self.categorical_ids: # There are categorical features in the table feature_types = output_data.supplementary_data.column_types['features'] - numerical_columns = feature_types[np.isin(feature_types, TYPE_TO_ID[str], invert=True)] + numerical_columns = feature_types[feature_types != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) @@ -123,10 +122,8 @@ def fit(self, input_data: InputData): self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, feature_type_ids) - # If there are categorical features - process it - if self.categorical_ids: - # For every categorical feature - perform encoding - self._fit_label_encoders(input_data) + # For every existing categorical feature - perform encoding + 
self._fit_label_encoders(input_data.features) return self.encoders def transform(self, input_data: InputData) -> OutputData: @@ -134,9 +131,8 @@ def transform(self, input_data: InputData) -> OutputData: Applicable during predict stage """ copied_data = deepcopy(input_data) - if self.categorical_ids: - # If categorical features exists - transform them inplace in InputData - self._apply_label_encoder(copied_data.features) + # If categorical features exist - transform them inplace in InputData + self._apply_label_encoder(copied_data.features) output_data = self._convert_to_output(copied_data, copied_data.features) @@ -149,14 +145,13 @@ def _update_column_types(self, output_data: OutputData): feature_types = output_data.supplementary_data.column_types['features'] feature_types[self.categorical_ids] = TYPE_TO_ID[int] - def _fit_label_encoders(self, input_data: InputData): + def _fit_label_encoders(self, data: np.ndarray): """ Fit LabelEncoder for every categorical column in the dataset """ - for categorical_id in self.categorical_ids: - categorical_column = input_data.features[:, categorical_id] + categorical_columns = data[:, self.categorical_ids].astype(str) + for column_id, column in zip(self.categorical_ids, categorical_columns.T): le = LabelEncoder() - le.fit(categorical_column) - - self.encoders.update({categorical_id: le}) + le.fit(column) + self.encoders[column_id] = le def _apply_label_encoder(self, data: np.ndarray): """ @@ -165,13 +160,13 @@ def _apply_label_encoder(self, data: np.ndarray): Args: data: numpy array with all features """ - categorical_columns = data[:, self.categorical_ids] + categorical_columns = data[:, self.categorical_ids].astype(str) for column_id, column in zip(self.categorical_ids, categorical_columns.T): column_encoder = self.encoders[column_id] - column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, column))) + column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, column))) transformed_column = column_encoder.transform(column) - nan_idxs = np.flatnonzero(pd.isna(column)) + nan_idxs = np.flatnonzero(column == 'nan') if len(nan_idxs): # Store np.nan values transformed_column = transformed_column.astype(object) From 280822e15d946f1f36e0a025ff66e13baa32990e Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 10 Jul 2023 17:20:00 +0300 Subject: [PATCH 47/72] minor test lint fix --- .../data_operations/test_data_operations_implementations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index 9064781164..ad55628231 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -169,7 +169,7 @@ def get_mixed_data(task=None, extended=False): [8, '1', '1', 0, '1', 'not blue', 'da bu'], [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object) feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], - TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) + TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) target_type_ids = np.array([TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': feature_type_ids, 'target': target_type_ids}) From e2e287afcae0b6593d3e5940cce4c99ab17247c8 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 10 Jul 2023 17:59:37 +0300 Subject: [PATCH 48/72] minor polishing --- 
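Note for this patch: the branch removals below lean on NumPy's handling of empty index lists, where selecting zero columns yields a zero-width array, so the non-categorical slice can be stacked unconditionally. A toy sketch of that hstack recombination using plain scikit-learn (not FEDOT's wrapper class; names are illustrative):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

features = np.array([[1.0, 'a'], [2.0, 'b'], [3.0, 'a']], dtype=object)
categorical_ids, non_categorical_ids = [1], [0]

encoder = OneHotEncoder()
encoded = encoder.fit_transform(features[:, categorical_ids].astype(str)).toarray()
numeric = features[:, non_categorical_ids]  # zero-width (n, 0) array when there are no numeric columns
transformed = np.hstack((numeric, encoded))
encoded_ids = np.array(range(numeric.shape[1], transformed.shape[1]))
print(transformed.shape, encoded_ids)  # (3, 3) [1 2]
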
.../data_operations/categorical_encoders.py | 31 +++++++------------ fedot/preprocessing/categorical.py | 8 ++--- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 34e08f5a45..dc6582a3e9 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -40,7 +40,7 @@ def fit(self, input_data: InputData): # If there are categorical features - process it if self.categorical_ids: - updated_cat_features = np.array(features[:, self.categorical_ids], dtype=str) + updated_cat_features = features[:, self.categorical_ids].astype(str) self.encoder.fit(updated_cat_features) return self.encoder @@ -55,13 +55,10 @@ def transform(self, input_data: InputData) -> OutputData: """ copied_data = deepcopy(input_data) - features = copied_data.features - if not self.categorical_ids: - # If there are no categorical features in the table - transformed_features = features - else: - # If categorical features are exists - transformed_features = self._apply_one_hot_encoding(features) + transformed_features = copied_data.features + if self.categorical_ids: + # If categorical features exist + transformed_features = self._apply_one_hot_encoding(transformed_features) # Update features output_data = self._convert_to_output(copied_data, @@ -90,19 +87,13 @@ def _apply_one_hot_encoding(self, features: np.ndarray) -> np.ndarray: :param features: tabular data for processing :return transformed_features: transformed features table """ + transformed_categorical = self.encoder.transform(features[:, self.categorical_ids]).toarray() - categorical_features = np.array(features[:, self.categorical_ids]) - transformed_categorical = self.encoder.transform(categorical_features).toarray() - - # If there are non-categorical features in the data - if not self.non_categorical_ids: - transformed_features = transformed_categorical - else: - # Stack transformed categorical and non-categorical data - non_categorical_features = np.array(features[:, self.non_categorical_ids]) - frames = (non_categorical_features, transformed_categorical) - transformed_features = np.hstack(frames) - self.encoded_ids = np.array(range(non_categorical_features.shape[1], transformed_features.shape[1])) + # Stack transformed categorical and non-categorical data, ignore if none + non_categorical_features = features[:, self.non_categorical_ids] + frames = (non_categorical_features, transformed_categorical) + transformed_features = np.hstack(frames) + self.encoded_ids = np.array(range(non_categorical_features.shape[1], transformed_features.shape[1])) return transformed_features diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 449dbbdf8c..bde52fed7e 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -32,7 +32,7 @@ def fit(self, input_data: InputData): for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): pd_column = pd.Series(column, name=column_id, copy=True) is_nan = pd_column.isna() - column_nuniques = pd_column.nunique(False) + column_nuniques = pd_column.nunique(dropna=False) if is_nan.sum(): # This categorical column has nans pd_column[is_nan] = FEDOT_STR_NAN @@ -55,10 +55,6 @@ def 
transform(self, input_data: InputData) -> InputData: """ Apply transformation (converting str into integers) for selected (while training) features. """ - if len(self.binary_ids_to_convert) == 0: - # There are no binary categorical features - return input_data - copied_data = deepcopy(input_data) self._apply_encoder(copied_data.features) @@ -107,7 +103,7 @@ def _apply_encoder(self, data: np.ndarray): converted = encoder.transform(column) if len(nan_idxs): - # Column has nans in its structure - after conversion replace it + # Column has nans in its structure - replace them after conversion converted = converted.astype(float) converted[nan_idxs] = np.nan data[:, column_id] = converted From 752b4aca28813699c47ae95a6c4a2d31e041b8df Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 11 Jul 2023 17:47:58 +0300 Subject: [PATCH 49/72] applymap simplification data_types.py --- fedot/preprocessing/data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index da0dc272fe..7643c0cc3d 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -361,7 +361,7 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: types, which column contains. """ table_of_types = pd.DataFrame(table, copy=True) - table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8) + table_of_types = table_of_types.replace(np.nan, None).applymap(lambda el: TYPE_TO_ID[type(el)]) # Build dataframe with unique types for each column uniques = table_of_types.apply(pd.unique, result_type='reduce').to_frame(_TYPES).T From 0148e3530bdc16f222559a57931af1af16f755e9 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 11 Jul 2023 17:53:02 +0300 Subject: [PATCH 50/72] test_pipeline.py increase time constraint --- test/unit/pipelines/test_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/pipelines/test_pipeline.py b/test/unit/pipelines/test_pipeline.py index 2608e9338f..bfd656c9bc 100644 --- a/test/unit/pipelines/test_pipeline.py +++ b/test/unit/pipelines/test_pipeline.py @@ -389,7 +389,7 @@ def test_pipeline_fit_time_constraint(): test_pipeline_second = pipeline_first() predicted_second = test_pipeline_second.fit(input_data=train_data, - time_constraint=datetime.timedelta(seconds=1.6)) + time_constraint=datetime.timedelta(seconds=2.1)) computation_time_second = test_pipeline_second.computation_time assert comp_time_proc_with_first_constraint < comp_time_proc_with_second_constraint assert computation_time_first is None From 432d9eafd22ec73c227e0b5187c31e9ee0e833e5 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 12:59:36 +0300 Subject: [PATCH 51/72] rename all *types to *type_ids --- .../data/merge/supplementary_data_merger.py | 12 ++++---- .../data_operations/categorical_encoders.py | 8 ++--- .../data_operations/sklearn_selectors.py | 4 +-- .../sklearn_transformations.py | 4 +-- .../data_operations/ts_transformations.py | 8 ++--- fedot/preprocessing/categorical.py | 4 +-- fedot/preprocessing/data_types.py | 30 +++++++++---------- test/unit/preprocessing/test_preprocessors.py | 16 +++++----- 8 files changed, 42 insertions(+), 44 deletions(-) diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 866f450cc2..ea73a7464d 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ 
b/fedot/core/data/merge/supplementary_data_merger.py @@ -84,8 +84,8 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: # Concatenate types for features columns and # choose target type of the main target as the new target type - new_feature_types = [] - new_target_types = None + new_feature_type_ids = [] + new_target_type_ids = None for output in self.outputs: if output.supplementary_data.column_types is None: self.log.debug('Perform determination of column types in DataMerger') @@ -93,10 +93,10 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: output.supplementary_data.column_types = table_corr.prepare_column_types_info(output.predict, output.target, output.task) - feature_types = output.supplementary_data.column_types['features'] - new_feature_types.extend(feature_types) + feature_type_ids = output.supplementary_data.column_types['features'] + new_feature_type_ids.extend(feature_type_ids) if output.supplementary_data.is_main_target: # Target can be None for predict stage - new_target_types = output.supplementary_data.column_types.get('target') - return {'features': np.array(new_feature_types), 'target': new_target_types} + new_target_type_ids = output.supplementary_data.column_types.get('target') + return {'features': np.array(new_feature_type_ids), 'target': new_target_type_ids} diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index dc6582a3e9..7c5323d891 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -70,8 +70,8 @@ def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. Categorical columns becomes integer with extension """ if self.categorical_ids: # There are categorical features in the table - feature_types = output_data.supplementary_data.column_types['features'] - numerical_columns = feature_types[feature_types != TYPE_TO_ID[str]] + feature_type_ids = output_data.supplementary_data.column_types['features'] + numerical_columns = feature_type_ids[feature_type_ids != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) @@ -133,8 +133,8 @@ def transform(self, input_data: InputData) -> OutputData: def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. 
Categorical becomes integer """ - feature_types = output_data.supplementary_data.column_types['features'] - feature_types[self.categorical_ids] = TYPE_TO_ID[int] + feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids[self.categorical_ids] = TYPE_TO_ID[int] def _fit_label_encoders(self, data: np.ndarray): """ Fit LabelEncoder for every categorical column in the dataset """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index 23c56329e1..1248865b85 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -81,10 +81,10 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] if cols_number_removed: # There are several columns, which were dropped - feature_types = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.column_types['features'] # Calculate - output_data.supplementary_data.column_types['features'] = feature_types[self.remain_features_mask] + output_data.supplementary_data.column_types['features'] = feature_type_ids[self.remain_features_mask] def _make_new_table(self, features): """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 05d824e320..67a3c9bbe1 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -196,9 +196,9 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_added = output_data.predict.shape[1] - source_features_shape[1] if cols_number_added > 0: # There are new columns in the table - feature_types = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.column_types['features'] new_types = [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = np.append(feature_types, new_types) + output_data.supplementary_data.column_types['features'] = np.append(feature_type_ids, new_types) class ScalingImplementation(EncodedInvariantImplementation): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 67ce85ff3d..6881b50e9c 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -128,13 +128,13 @@ def _update_column_types(self, output_data: OutputData): """ _, features_n_cols = output_data.predict.shape - feature_types = np.array([TYPE_TO_ID[float]] * features_n_cols) - column_types = {'features': feature_types} + feature_type_ids = np.array([TYPE_TO_ID[float]] * features_n_cols) + column_types = {'features': feature_type_ids} if output_data.target is not None and 
len(output_data.target.shape) > 1: _, target_n_cols = output_data.target.shape - target_types = np.array([TYPE_TO_ID[float]] * target_n_cols) - column_types['target'] = target_types + target_type_ids = np.array([TYPE_TO_ID[float]] * target_n_cols) + column_types['target'] = target_type_ids output_data.supplementary_data.column_types = column_types def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index bde52fed7e..fa5da3583e 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -59,8 +59,8 @@ def transform(self, input_data: InputData) -> InputData: self._apply_encoder(copied_data.features) # Update features types - feature_types = copied_data.supplementary_data.column_types['features'] - feature_types[self.binary_ids_to_convert] = TYPE_TO_ID[int] + feature_type_ids = copied_data.supplementary_data.column_types['features'] + feature_type_ids[self.binary_ids_to_convert] = TYPE_TO_ID[int] return copied_data def fit_transform(self, input_data: InputData) -> InputData: diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 7643c0cc3d..f046306805 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -164,15 +164,15 @@ def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = self.target_columns_info = define_column_types(target) target = self.target_types_converting(target=target, task=task) - feature_types = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) - self._check_columns_vs_types_number(predictors, feature_types) + feature_type_ids = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) + self._check_columns_vs_types_number(predictors, feature_type_ids) if target is None or task.task_type is TaskTypesEnum.ts_forecasting: - return {'features': feature_types} + return {'features': feature_type_ids} else: - target_types = _generate_list_with_types(self.target_columns_info, self.target_converted_columns) - self._check_columns_vs_types_number(target, target_types) - return {'features': feature_types, 'target': target_types} + target_type_ids = _generate_list_with_types(self.target_columns_info, self.target_converted_columns) + self._check_columns_vs_types_number(target, target_type_ids) + return {'features': feature_type_ids, 'target': target_type_ids} def _retain_columns_info_without_types_conflicts(self, data: InputData): """ Update information in supplementary info - retain info only about remained columns. @@ -277,8 +277,8 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - feature_types = data.supplementary_data.column_types['features'] - is_numeric_type = np.isin(feature_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) + feature_type_ids = data.supplementary_data.column_types['features'] + is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) nuniques = num_df.nunique(dropna=True) @@ -291,7 +291,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): # Columns need to be transformed into categorical (string) ones self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) # Update information about column types (in-place) - feature_types[cat_col_ids] = TYPE_TO_ID[str] + feature_type_ids[cat_col_ids] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -302,8 +302,8 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Update information about column types (in-place) - feature_types = data.supplementary_data.column_types['features'] - feature_types[self.numerical_into_str] = TYPE_TO_ID[str] + feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ @@ -328,8 +328,8 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) # Update information about column types (in-place) - feature_types = data.supplementary_data.column_types['features'] - feature_types[is_numeric_ids] = TYPE_TO_ID[float] + feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids[is_numeric_ids] = TYPE_TO_ID[float] # The columns consist mostly of truly str values and has a few ints/floats in it is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1) @@ -352,8 +352,8 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) - feature_types = data.supplementary_data.column_types['features'] - feature_types[str_col_ids] = TYPE_TO_ID[float] + feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids[str_col_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index afb64661a3..cd2a95f3da 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -129,15 +129,13 @@ def test_column_types_converting_correctly(): types_corr = TableTypesCorrector() data = types_corr.convert_data_for_fit(data) - feature_types = data.supplementary_data.column_types['features'] - target_types = data.supplementary_data.column_types['target'] - - assert len(feature_types) == 4 - assert len(target_types) == 2 - assert feature_types[0] == TYPE_TO_ID[str] - assert 
feature_types[1] == TYPE_TO_ID[str] - assert feature_types[2] == TYPE_TO_ID[str] - assert (target_types == TYPE_TO_ID[str]).all() + feature_type_ids = data.supplementary_data.column_types['features'] + target_type_ids = data.supplementary_data.column_types['target'] + + assert len(feature_type_ids) == 4 + assert len(target_type_ids) == 2 + assert (feature_type_ids[[0, 1, 2]] == TYPE_TO_ID[str]).all() + assert (target_type_ids == TYPE_TO_ID[str]).all() def test_column_types_process_correctly(): From 3c338d98ba0f8f225daebe13dcc68c171241d0d4 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 13:26:34 +0300 Subject: [PATCH 52/72] rename column_types to col_type_ids --- fedot/core/data/data_preprocessing.py | 2 +- .../data/merge/supplementary_data_merger.py | 12 ++--- fedot/core/data/supplementary_data.py | 4 +- .../data_operations/categorical_encoders.py | 10 ++--- .../data_operations/sklearn_selectors.py | 4 +- .../sklearn_transformations.py | 8 ++-- .../data_operations/ts_transformations.py | 6 +-- fedot/core/operations/model.py | 20 ++++----- fedot/preprocessing/categorical.py | 4 +- fedot/preprocessing/data_types.py | 45 ++++++++++--------- fedot/preprocessing/preprocessing.py | 4 +- test/unit/data/test_supplementary_data.py | 10 ++--- .../test_data_operations_implementations.py | 14 +++--- .../test_preprocessing_through_api.py | 2 +- test/unit/preprocessing/test_preprocessors.py | 8 ++-- 15 files changed, 77 insertions(+), 76 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 26bc1ea462..b5519034c7 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -98,7 +98,7 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: return False - column_type_ids = data.supplementary_data.column_types.get('features') + column_type_ids = data.supplementary_data.col_type_ids['features'] cat_ids, non_cat_ids = find_categorical_columns(data.features, column_type_ids) data_has_categorical_columns = len(cat_ids) > 0 diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index ea73a7464d..6a4c747e4a 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -24,7 +24,7 @@ def merge(self) -> SupplementaryData: obligatorily_preprocessed=self.all_preprocessed(), optionally_preprocessed=self.all_preprocessed(is_obligatory=False), non_int_idx=None, # is set elsewhere (by preprocessor or during pipeline fit/predict) - column_types=self.merge_column_types() + col_type_ids=self.merge_column_types() ) def calculate_dataflow_len(self) -> int: @@ -80,23 +80,23 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: """ Store information about column types in tabular data for merged data """ if self.main_output.data_type is not DataTypesEnum.table: # Data is not tabular - return self.main_output.supplementary_data.column_types + return self.main_output.supplementary_data.col_type_ids # Concatenate types for features columns and # choose target type of the main target as the new target type new_feature_type_ids = [] new_target_type_ids = None for output in self.outputs: - if output.supplementary_data.column_types is None: + if output.supplementary_data.col_type_ids is None: self.log.debug('Perform determination of column types in DataMerger') table_corr = TableTypesCorrector() - output.supplementary_data.column_types = 
table_corr.prepare_column_types_info(output.predict,
+                output.supplementary_data.col_type_ids = table_corr.prepare_column_types_info(output.predict,
                                                                                               output.target,
                                                                                               output.task)
-            feature_type_ids = output.supplementary_data.column_types['features']
+            feature_type_ids = output.supplementary_data.col_type_ids['features']
             new_feature_type_ids.extend(feature_type_ids)

             if output.supplementary_data.is_main_target:
                 # Target can be None for predict stage
-                new_target_type_ids = output.supplementary_data.column_types.get('target')
+                new_target_type_ids = output.supplementary_data.col_type_ids.get('target')

         return {'features': np.array(new_feature_type_ids), 'target': new_target_type_ids}

diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py
index bd9f8d8881..8a053be9a9 100644
--- a/fedot/core/data/supplementary_data.py
+++ b/fedot/core/data/supplementary_data.py
@@ -27,8 +27,8 @@ class SupplementaryData:
     optionally_preprocessed: bool = False
     # Collection with non-int indexes
     non_int_idx: Optional[list] = None
-    # Dictionary with features and target column types
-    column_types: Optional[Dict[str, np.ndarray]] = None
+    # Dictionary with features and target column type numeric identifiers
+    col_type_ids: Optional[Dict[str, np.ndarray]] = None

     @property
     def compound_mask(self):
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 7c5323d891..5d2993417b 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -35,7 +35,7 @@ def fit(self, input_data: InputData):
         :return encoder: trained encoder (optional output)
         """
         features = input_data.features
-        feature_type_ids = input_data.supplementary_data.column_types['features']
+        feature_type_ids = input_data.supplementary_data.col_type_ids['features']
         self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, feature_type_ids)

         # If there are categorical features - process it
@@ -70,8 +70,8 @@ def _update_column_types(self, output_data: OutputData):
         """ Update column types after encoding.
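        (the numerical columns keep their type ids, while the encoded categorical ones are replaced by appended binary int columns)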
Categorical columns becomes integer with extension """ if self.categorical_ids: # There are categorical features in the table - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] numerical_columns = feature_type_ids[feature_type_ids != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding @@ -78,7 +78,7 @@ def _update_column_types(self, output_data: OutputData): numerical_columns = np.append(numerical_columns, [TYPE_TO_ID[int]] * encoded_columns_number) output_data.encoded_idx = self.encoded_ids - output_data.supplementary_data.column_types['features'] = numerical_columns + output_data.supplementary_data.col_type_ids['features'] = numerical_columns def _apply_one_hot_encoding(self, features: np.ndarray) -> np.ndarray: """ @@ -109,7 +109,7 @@ def __init__(self, params: Optional[OperationParameters] = None): self.non_categorical_ids: List[int] = None def fit(self, input_data: InputData): - feature_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.col_type_ids['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, feature_type_ids) @@ -133,7 +133,7 @@ def transform(self, input_data: InputData) -> OutputData: def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. Categorical becomes integer """ - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] feature_type_ids[self.categorical_ids] = TYPE_TO_ID[int] def _fit_label_encoders(self, data: np.ndarray): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index 1248865b85..8444b4eaf0 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -81,10 +81,10 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] if cols_number_removed: # There are several columns, which were dropped - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] # Calculate - output_data.supplementary_data.column_types['features'] = feature_type_ids[self.remain_features_mask] + output_data.supplementary_data.col_type_ids['features'] = feature_type_ids[self.remain_features_mask] def _make_new_table(self, features): """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 67a3c9bbe1..df87b7b1b5 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -89,7 +89,7 @@ def update_column_types(output_data: OutputData) -> OutputData: """ _, n_cols = output_data.predict.shape - output_data.supplementary_data.column_types['features'] = np.array([TYPE_TO_ID[float]] * n_cols) 
+ output_data.supplementary_data.col_type_ids['features'] = np.array([TYPE_TO_ID[float]] * n_cols) return output_data @@ -196,9 +196,9 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_added = output_data.predict.shape[1] - source_features_shape[1] if cols_number_added > 0: # There are new columns in the table - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] new_types = [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = np.append(feature_type_ids, new_types) + output_data.supplementary_data.col_type_ids['features'] = np.append(feature_type_ids, new_types) class ScalingImplementation(EncodedInvariantImplementation): @@ -290,7 +290,7 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) if data_type_is_table(input_data) and data_has_categorical_features(input_data): - feature_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.col_type_ids['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, feature_type_ids) numerical, categorical = divide_data_categorical_numerical(input_data, self.categorical_ids, diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 6881b50e9c..985762ab29 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -129,13 +129,13 @@ def _update_column_types(self, output_data: OutputData): _, features_n_cols = output_data.predict.shape feature_type_ids = np.array([TYPE_TO_ID[float]] * features_n_cols) - column_types = {'features': feature_type_ids} + col_type_ids = {'features': feature_type_ids} if output_data.target is not None and len(output_data.target.shape) > 1: _, target_n_cols = output_data.target.shape target_type_ids = np.array([TYPE_TO_ID[float]] * target_n_cols) - column_types['target'] = target_type_ids - output_data.supplementary_data.column_types = column_types + col_type_ids['target'] = target_type_ids + output_data.supplementary_data.col_type_ids = col_type_ids def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, forecast_length: int, old_idx: np.array): diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py index 250016fc74..99caabc25b 100644 --- a/fedot/core/operations/model.py +++ b/fedot/core/operations/model.py @@ -35,39 +35,39 @@ def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> Ou # Add information about features if is_regression_task or is_ts_forecasting_task: if len(predict_shape) < 2: - column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} + col_type_ids = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} else: - column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} + col_type_ids = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} else: if len(predict_shape) < 2: output_data.predict = output_data.predict.reshape((-1, 1)) predict_shape = output_data.predict.shape # Classification task or clustering target_type = int if output_mode == 'labels' else float 
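            # ('labels' output carries integer class indices; probability-like outputs stay float)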
- column_types = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} + col_type_ids = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} # Make feature types static to suit supplementary data contract - column_types['features'] = np.array(column_types['features']) + col_type_ids['features'] = np.array(col_type_ids['features']) # Add information about target target_shape = output_data.target.shape if output_data.target is not None else None if target_shape is None: # There is no target column in output data - output_data.supplementary_data.column_types = column_types + output_data.supplementary_data.col_type_ids = col_type_ids return output_data if is_regression_task or is_ts_forecasting_task: if len(target_shape) > 1: - column_types['target'] = [TYPE_TO_ID[float]] * target_shape[1] + col_type_ids['target'] = [TYPE_TO_ID[float]] * target_shape[1] else: # Array present "time series" - column_types['target'] = [TYPE_TO_ID[float]] * len(output_data.target) + col_type_ids['target'] = [TYPE_TO_ID[float]] * len(output_data.target) else: # Classification task or clustering - column_types['target'] = [TYPE_TO_ID[int]] * predict_shape[1] + col_type_ids['target'] = [TYPE_TO_ID[int]] * predict_shape[1] # Make target types static to suit supplementary data contract - column_types['target'] = np.array(column_types['target']) + col_type_ids['target'] = np.array(col_type_ids['target']) - output_data.supplementary_data.column_types = column_types + output_data.supplementary_data.col_type_ids = col_type_ids return output_data diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index fa5da3583e..c0ea6913eb 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -24,7 +24,7 @@ def fit(self, input_data: InputData): Find indices of columns which are contains categorical values. Binary features and at the same time has str objects. 
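        (i.e. categorical columns with exactly two unique values stored as strings)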
If there are such features - convert it into int """ - feature_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.col_type_ids['features'] categorical_ids, _ = find_categorical_columns(input_data.features, feature_type_ids) @@ -59,7 +59,7 @@ def transform(self, input_data: InputData) -> InputData: self._apply_encoder(copied_data.features) # Update features types - feature_type_ids = copied_data.supplementary_data.column_types['features'] + feature_type_ids = copied_data.supplementary_data.col_type_ids['features'] feature_type_ids[self.binary_ids_to_convert] = TYPE_TO_ID[int] return copied_data diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index f046306805..19732e2196 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Tuple, Optional, List, Dict, Sequence +from collections.abc import Sequence +from typing import TYPE_CHECKING, Tuple, Optional, List, Dict import numpy as np import pandas as pd @@ -84,7 +85,7 @@ def convert_data_for_fit(self, data: InputData): # And in target(s) data.target = self.target_types_converting(target=data.target, task=data.task) - data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, + data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) @@ -92,8 +93,8 @@ def convert_data_for_fit(self, data: InputData): # Launch conversion float and integer features into categorical self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types - self.feature_type_ids = data.supplementary_data.column_types['features'].copy() - self.target_type_ids = data.supplementary_data.column_types.get( + self.feature_type_ids = data.supplementary_data.col_type_ids['features'].copy() + self.target_type_ids = data.supplementary_data.col_type_ids.get( 'target', np.empty((self.feature_type_ids.shape[0], 1), dtype=float) ).copy() @@ -107,7 +108,7 @@ def convert_data_for_predict(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.features_converted_columns) data.features = apply_type_transformation(data.features, self.feature_type_ids, self.log) data.target = apply_type_transformation(data.target, self.target_type_ids, self.log) - data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, + data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) @@ -184,15 +185,15 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) - data.supplementary_data.column_types['features'] = np.delete( - data.supplementary_data.column_types['features'], + data.supplementary_data.col_type_ids['features'] = np.delete( + data.supplementary_data.col_type_ids['features'], list(self.string_columns_transformation_failed) ) - def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): + def _check_columns_vs_types_number(self, table: np.ndarray, col_type_ids: Sequence): # Check if columns number correct _, n_cols = table.shape - if n_cols != len(column_types): + if n_cols != len(col_type_ids): # There is an incorrect types calculation 
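            # (type ids fell out of sync with the table, e.g. a conversion dropped or added columns
            # without updating the supplementary data)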
self.log.warning('Columns number and types numbers do not match.') @@ -277,7 +278,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. If feature column contains int or float values with few unique values (less than 13) """ - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) @@ -302,14 +303,14 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Update information about column types (in-place) - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] feature_type_ids[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ Automatically determine categorical features which should be converted into float """ - is_str_type = data.supplementary_data.column_types['features'] == TYPE_TO_ID[str] + is_str_type = data.supplementary_data.col_type_ids['features'] == TYPE_TO_ID[str] str_col_ids = np.flatnonzero(is_str_type) str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids) orig_nans_cnt = str_cols_df.isna().sum(axis=0) @@ -328,7 +329,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) # Update information about column types (in-place) - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] feature_type_ids[is_numeric_ids] = TYPE_TO_ID[float] # The columns consist mostly of truly str values and has a few ints/floats in it @@ -352,7 +353,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] feature_type_ids[str_col_ids] = TYPE_TO_ID[float] @@ -404,14 +405,14 @@ def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> return frame.loc[:, cols_have_any] -def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: LoggerAdapter): +def apply_type_transformation(table: np.ndarray, col_type_ids: Sequence, log: LoggerAdapter): """ Apply transformation for columns in dataset into desired type. 
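        (int and str ids map back to their own types, while bool, NoneType and float ids are all cast to float,
        as the mapping below shows; e.g. a column of number-like strings with col_type_ids=[TYPE_TO_ID[int]]
        is returned as an int column, cf. test_str_numbers_with_dots_and_commas_in_predict)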
Perform transformation on predict stage when column types were already determined during fit """ table_df = pd.DataFrame(table, copy=False) - types_sr = pd.Series(column_types).map({ + types_sr = pd.Series(col_type_ids).map({ **{TYPE_TO_ID[t]: t for t in [int, str]}, **{TYPE_TO_ID[t]: float for t in [bool, type(None), float]} }) @@ -458,27 +459,27 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame, :param columns_types_info: dictionary with initial column types :param converted_columns: dictionary with transformed column types """ - updated_column_types = [] + updated_col_type_ids = [] for column_id, column_type_ids in columns_types_info.loc[_TYPES].items(): if len(column_type_ids) == 1: # Column initially contain only one type - updated_column_types.append(column_type_ids[0]) + updated_col_type_ids.append(column_type_ids[0]) elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids: # Column with one type and nans filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]] - updated_column_types.append(filtered_types[0]) + updated_col_type_ids.append(filtered_types[0]) else: if TYPE_TO_ID[str] in column_type_ids: # Mixed-types column with string new_col_id = converted_columns[column_id] if new_col_id is not None: - updated_column_types.append(new_col_id) + updated_col_type_ids.append(new_col_id) else: # Mixed-types with float and integer - updated_column_types.append(TYPE_TO_ID[float]) + updated_col_type_ids.append(TYPE_TO_ID[float]) - return np.array(updated_column_types) + return np.array(updated_col_type_ids) def _process_predict_column_values_one_by_one(value, current_type: type): diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 3ad213c912..6292cc786c 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -402,7 +402,7 @@ def _train_target_encoder(self, data: InputData, source_name: str): data: data to be encoded source_name: name of the data source node """ - categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.column_types.get('target')) + categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.col_type_ids.get('target')) if categorical_ids: # Target is categorical @@ -428,7 +428,7 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarra encoded_target = data.target if encoder is not None: # Target encoders have already been fitted - data.supplementary_data.column_types['target'] = np.array([TYPE_TO_ID[int]]) + data.supplementary_data.col_type_ids['target'] = np.array([TYPE_TO_ID[int]]) encoded_target = encoder.transform(encoded_target) if len(encoded_target.shape) == 1: encoded_target = encoded_target.reshape((-1, 1)) diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index 7c768392b5..0a4f9beaa1 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -20,14 +20,14 @@ def outputs_table_with_different_types(): task = Task(TaskTypesEnum.regression) idx = [0, 1, 2] target = [1, 2, 10] - data_info_first = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str], TYPE_TO_ID[float]]), + data_info_first = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[str], TYPE_TO_ID[float]]), 'target': np.array([TYPE_TO_ID[int]])}) output_first = OutputData(idx=idx, features=None, predict=np.array([['a', 1.1], ['b', 2], ['c', 3]], dtype=object), task=task, 
target=target, data_type=DataTypesEnum.table, supplementary_data=data_info_first) - data_info_second = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]]), + data_info_second = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[float]]), 'target': np.array([TYPE_TO_ID[int]])}) output_second = OutputData(idx=idx, features=None, predict=np.array([[2.5], [2.1], [9.3]], dtype=float), @@ -118,11 +118,11 @@ def test_define_types_after_merging(outputs_table_with_different_types): merged_data = DataMerger.get(outputs).merge() updated_info = merged_data.supplementary_data - feature_type_ids = updated_info.column_types['features'] - target_type_ids = updated_info.column_types['target'] + feature_type_ids = updated_info.col_type_ids['features'] + target_type_ids = updated_info.col_type_ids['target'] # Target type must stay the same - ancestor_target_type = outputs[0].supplementary_data.column_types['target'][0] + ancestor_target_type = outputs[0].supplementary_data.col_type_ids['target'][0] assert target_type_ids[0] == ancestor_target_type assert len(feature_type_ids) == 3 assert tuple(feature_type_ids) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float]) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index ad55628231..b5832b1bc1 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -129,7 +129,7 @@ def get_multivariate_time_series(mutli_ts=False): def get_nan_inf_data(): - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]] * 4)}) + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[float]] * 4)}) train_input = InputData(idx=[0, 1, 2, 3], features=np.array([[1, 2, 3, 4], [2, np.nan, 4, 5], @@ -144,7 +144,7 @@ def get_nan_inf_data(): def get_single_feature_data(task=None): - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int]]), + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[int]]), 'target': np.array([TYPE_TO_ID[int]])}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=np.array([[1], [2], [3], [7], [8], [9]]), @@ -171,7 +171,7 @@ def get_mixed_data(task=None, extended=False): feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) target_type_ids = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': feature_type_ids, + supp_data = SupplementaryData(col_type_ids={'features': feature_type_ids, 'target': target_type_ids}) else: features = np.array([[1, '0', 1], @@ -182,7 +182,7 @@ def get_mixed_data(task=None, extended=False): [9, '0', 0]], dtype=object) feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) target_type_ids = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': feature_type_ids, + supp_data = SupplementaryData(col_type_ids={'features': feature_type_ids, 'target': target_type_ids}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], @@ -203,7 +203,7 @@ def get_nan_binary_data(task=None): For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': feature_type_ids}) + supp_data = 
SupplementaryData(col_type_ids={'features': feature_type_ids}) features = np.array([[1, '0', 0], [np.nan, np.nan, np.nan], [0, '2', 1], @@ -232,7 +232,7 @@ def get_unbalanced_dataset(size=10, disbalance=0.4, target_dim=None): if target_dim == 2: target = target.reshape(-1, 1) - supp_data = SupplementaryData(column_types={ + supp_data = SupplementaryData(col_type_ids={ 'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[str]]), 'target': np.array([TYPE_TO_ID[int]]) }) @@ -253,7 +253,7 @@ def data_with_binary_int_features_and_equal_categories(): must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[int]])}) + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[int]])}) task = Task(TaskTypesEnum.classification) features = np.array([[1, 10], [np.nan, np.nan], diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py index 5726b041df..a4d0e83fd8 100644 --- a/test/unit/preprocessing/test_preprocessing_through_api.py +++ b/test/unit/preprocessing/test_preprocessing_through_api.py @@ -11,7 +11,7 @@ def data_with_only_categorical_features(): """ Generate tabular data with only categorical features. All of them are binary. """ - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str]] * 3)}) + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[str]] * 3)}) task = Task(TaskTypesEnum.regression) features = np.array([["'a'", "0", "1"], ["'b'", "1", "0"], diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index cd2a95f3da..9cc3027ebb 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -129,8 +129,8 @@ def test_column_types_converting_correctly(): types_corr = TableTypesCorrector() data = types_corr.convert_data_for_fit(data) - feature_type_ids = data.supplementary_data.column_types['features'] - target_type_ids = data.supplementary_data.column_types['target'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] + target_type_ids = data.supplementary_data.col_type_ids['target'] assert len(feature_type_ids) == 4 assert len(target_type_ids) == 2 @@ -154,7 +154,7 @@ def test_column_types_process_correctly(): pipeline.fit(train_data) predicted = pipeline.predict(test_data) - feature_type_ids = predicted.supplementary_data.column_types['features'] + feature_type_ids = predicted.supplementary_data.col_type_ids['features'] assert len(feature_type_ids) == predicted.predict.shape[1] # All output values are float assert (feature_type_ids == TYPE_TO_ID[float]).all() @@ -262,7 +262,7 @@ def test_str_numbers_with_dots_and_commas_in_predict(): input_data = InputData(idx=np.arange(4), features=features, target=target, task=task, data_type=DataTypesEnum.table) - transformed_predict = apply_type_transformation(table=input_data.features, column_types=[TYPE_TO_ID[int]], + transformed_predict = apply_type_transformation(table=input_data.features, col_type_ids=[TYPE_TO_ID[int]], log=default_log('test_str_numbers_with_dots_and_commas_in_predict')) assert all(transformed_predict == np.array([[8], [4], [3], [6]])) From 3fddcc8b5293154f7f4a309d840018b29fe771f8 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 13:59:38 +0300 Subject: [PATCH 
53/72] pandas version fix --- fedot/preprocessing/data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 19732e2196..96278bf0b3 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -362,7 +362,7 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: types, which column contains. """ table_of_types = pd.DataFrame(table, copy=True) - table_of_types = table_of_types.replace(np.nan, None).applymap(lambda el: TYPE_TO_ID[type(el)]) + table_of_types = table_of_types.replace({np.nan: None}).applymap(lambda el: TYPE_TO_ID[type(el)]) # Build dataframe with unique types for each column uniques = table_of_types.apply(pd.unique, result_type='reduce').to_frame(_TYPES).T From 315ab99d961aea9ca60a57492eaa8a4750858b02 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 14:00:29 +0300 Subject: [PATCH 54/72] inf condition simplification --- fedot/core/data/data_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index b5519034c7..c32adf36ff 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -14,7 +14,7 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - is_inf = (features == np.inf) | (features == -np.inf) + is_inf = np.isin(features, [np.inf, -np.inf]) if np.any(is_inf): features[is_inf] = np.nan From 5240a6c709f2e83967fb3cc6690d093a188fba2c Mon Sep 17 00:00:00 2001 From: Sergei Pakulin Date: Tue, 29 Aug 2023 19:18:53 +0500 Subject: [PATCH 55/72] upd gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 961f017444..cdacb38f3e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .idea/ +.vscode/ **/.pytest_cache/ **/__pycache__/ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm @@ -76,4 +77,4 @@ dist/ test/unit/test_log.log test/unit/catboost_info -local/ \ No newline at end of file +local/ From 9a770e007b6a1cb97cbb53ed8d5d385e49ba53b4 Mon Sep 17 00:00:00 2001 From: Sergei Pakulin Date: Wed, 30 Aug 2023 12:07:55 +0500 Subject: [PATCH 56/72] lint fixes --- fedot/api/api_utils/input_analyser.py | 6 ++---- fedot/core/data/data.py | 2 -- fedot/core/data/data_preprocessing.py | 4 ++-- fedot/core/data/merge/supplementary_data_merger.py | 2 +- fedot/core/data/supplementary_data.py | 2 +- .../data_operations/categorical_encoders.py | 1 - .../data_operations/sklearn_selectors.py | 2 +- .../data_operations/sklearn_transformations.py | 8 ++++---- .../implementation_interfaces.py | 2 +- fedot/core/operations/operation.py | 3 ++- fedot/core/repository/json_evaluation.py | 2 +- fedot/core/repository/operation_types_repository.py | 4 ++-- fedot/preprocessing/base_preprocessing.py | 3 ++- fedot/preprocessing/categorical.py | 2 +- fedot/preprocessing/data_types.py | 3 ++- fedot/preprocessing/dummy_preprocessing.py | 2 +- fedot/preprocessing/preprocessing.py | 6 +++--- test/integration/models/test_repository.py | 2 +- test/unit/preprocessing/test_preprocessors.py | 6 +++--- 19 files changed, 30 insertions(+), 32 deletions(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 3b9c5f2c23..d626835741 100644 --- 
a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -1,18 +1,16 @@ from functools import partial from inspect import signature -from typing import Dict, Tuple, Any, Union +from typing import Any, Dict, Tuple, Union import numpy as np from golem.core.log import default_log -from fedot.core.composer.meta_rules import get_cv_folds_number, get_recommended_preset, \ - get_early_stopping_generations +from fedot.core.composer.meta_rules import get_cv_folds_number, get_early_stopping_generations, get_recommended_preset from fedot.core.data.data import InputData from fedot.core.data.data_preprocessing import find_categorical_columns from fedot.core.data.multi_modal import MultiModalData from fedot.core.repository.dataset_types import DataTypesEnum - meta_rules = [get_cv_folds_number, get_recommended_preset, get_early_stopping_generations] diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index bf06dbf87b..63eac97fd0 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -2,14 +2,12 @@ import glob import os - from copy import copy, deepcopy from dataclasses import dataclass, field from typing import Any, Iterable, List, Optional, Tuple, Union import numpy as np import pandas as pd - from golem.core.log import default_log from golem.utilities.requirements_notificator import warn_requirement diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c32adf36ff..dccbe12803 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,9 +1,9 @@ -from typing import Tuple, Optional +from typing import Optional, Tuple import numpy as np import pandas as pd -from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts +from fedot.core.data.data import InputData, data_type_is_multi_ts, data_type_is_table, data_type_is_ts from fedot.core.repository.dataset_types import DataTypesEnum from fedot.preprocessing.data_types import TYPE_TO_ID diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 6a4c747e4a..6b5c414c46 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import Dict, List import numpy as np from golem.core.log import default_log diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index 8a053be9a9..08c5509a6f 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional, Dict +from typing import Dict, Optional import numpy as np diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 5d2993417b..0888843268 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -2,7 +2,6 @@ from typing import List, Optional import numpy as np - from sklearn.preprocessing import LabelEncoder, OneHotEncoder from fedot.core.data.data import InputData, OutputData diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py 
b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index 8444b4eaf0..fa880ae7fd 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from fedot.core.data.data import OutputData, InputData +from fedot.core.data.data import InputData, OutputData from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \ DataOperationImplementation from fedot.core.operations.operation_parameters import OperationParameters diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index df87b7b1b5..4037cbc89e 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -3,13 +3,13 @@ import numpy as np import pandas as pd -from sklearn.decomposition import KernelPCA, PCA, FastICA +from sklearn.decomposition import FastICA, KernelPCA, PCA from sklearn.impute import SimpleImputer from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler -from fedot.core.data.data import InputData, data_type_is_table, OutputData -from fedot.core.data.data_preprocessing import replace_inf_with_nans, convert_into_column, \ - divide_data_categorical_numerical, find_categorical_columns, data_has_categorical_features +from fedot.core.data.data import InputData, OutputData, data_type_is_table +from fedot.core.data.data_preprocessing import convert_into_column, data_has_categorical_features, \ + divide_data_categorical_numerical, find_categorical_columns, replace_inf_with_nans from fedot.core.operations.evaluation.operation_implementations. 
\ implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation from fedot.core.operations.operation_parameters import OperationParameters diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index 6e4703a6a5..aeb3e44790 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -5,7 +5,7 @@ import numpy as np from golem.core.log import default_log -from fedot.core.data.data import OutputData, InputData +from fedot.core.data.data import InputData, OutputData from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum from fedot.utilities.custom_errors import AbstractMethodNotImplementError diff --git a/fedot/core/operations/operation.py b/fedot/core/operations/operation.py index 3d4ae4134d..da44065277 100644 --- a/fedot/core/operations/operation.py +++ b/fedot/core/operations/operation.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import Optional, Union, Dict, Any +from typing import Any, Dict, Optional, Union from golem.core.log import default_log from golem.serializers.serializer import register_serializable @@ -120,6 +120,7 @@ def predict_for_fit(self, fitted_operation, data: InputData, params: Optional[Op def _predict(self, fitted_operation, data: InputData, params: Optional[OperationParameters] = None, output_mode: str = 'default', is_fit_stage: bool = False): + is_main_target = data.supplementary_data.is_main_target data_flow_length = data.supplementary_data.data_flow_length self._init(data.task, output_mode=output_mode, params=params, n_samples_data=data.features.shape[0]) diff --git a/fedot/core/repository/json_evaluation.py b/fedot/core/repository/json_evaluation.py index ba4483ce0e..9473fd03c4 100644 --- a/fedot/core/repository/json_evaluation.py +++ b/fedot/core/repository/json_evaluation.py @@ -1,5 +1,5 @@ from importlib import import_module -from typing import Union, TYPE_CHECKING, List +from typing import List, TYPE_CHECKING, Union # imports are required beneath in the function from fedot.core.repository.dataset_types import DataTypesEnum diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py index 7e42d95e60..6555a35242 100644 --- a/fedot/core/repository/operation_types_repository.py +++ b/fedot/core/repository/operation_types_repository.py @@ -2,13 +2,13 @@ import os from collections import defaultdict from dataclasses import dataclass -from typing import Dict, List, Optional, Union, TYPE_CHECKING +from typing import Dict, List, Optional, TYPE_CHECKING, Union import numpy as np from golem.core.log import default_log from golem.utilities.data_structures import ensure_wrapped_in_sequence -from fedot.core.constants import BEST_QUALITY_PRESET_NAME, AUTO_PRESET_NAME +from fedot.core.constants import AUTO_PRESET_NAME, BEST_QUALITY_PRESET_NAME from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.json_evaluation import import_enums_from_str, import_strategy_from_str, read_field from fedot.core.repository.tasks import Task, TaskTypesEnum diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index ae0ef29140..4c9de6cf5c 100644 --- 
a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Union, TYPE_CHECKING +from typing import Dict, Union, TYPE_CHECKING import numpy as np from sklearn.preprocessing import LabelEncoder @@ -168,6 +168,7 @@ def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModa def restore_index(self, input_data: InputData, result: OutputData) -> OutputData: """ restores index from ``input_data`` into ``result`` + Args: input_data: data to take the index from result: data to store index into diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index c0ea6913eb..5cde088d7a 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -6,7 +6,7 @@ from fedot.core.data.data import InputData from fedot.core.data.data_preprocessing import find_categorical_columns -from fedot.preprocessing.data_types import TYPE_TO_ID, FEDOT_STR_NAN +from fedot.preprocessing.data_types import FEDOT_STR_NAN, TYPE_TO_ID class BinaryCategoricalPreprocessor: diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 96278bf0b3..32d5f7e323 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,10 +1,11 @@ from __future__ import annotations from collections.abc import Sequence -from typing import TYPE_CHECKING, Tuple, Optional, List, Dict +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple import numpy as np import pandas as pd + from golem.core.log import LoggerAdapter, default_log from fedot.core.repository.tasks import Task, TaskTypesEnum diff --git a/fedot/preprocessing/dummy_preprocessing.py b/fedot/preprocessing/dummy_preprocessing.py index 36b76a390c..d3c4206e34 100644 --- a/fedot/preprocessing/dummy_preprocessing.py +++ b/fedot/preprocessing/dummy_preprocessing.py @@ -1,4 +1,4 @@ -from typing import Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Union import numpy as np from golem.core.log import default_log diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 6292cc786c..95985539d0 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -1,5 +1,5 @@ from copy import copy -from typing import Union, Optional +from typing import Optional, Union import numpy as np import pandas as pd @@ -8,7 +8,7 @@ from sklearn.preprocessing import LabelEncoder from fedot.core.data.data import InputData, np_datetime_to_numeric -from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_ts, data_type_is_text +from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts from fedot.core.data.data_preprocessing import ( data_has_categorical_features, data_has_missing_values, @@ -28,7 +28,7 @@ from fedot.core.repository.tasks import TaskTypesEnum from fedot.preprocessing.base_preprocessing import BasePreprocessor from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor -from fedot.preprocessing.data_type_check import exclude_ts, exclude_multi_ts, exclude_image +from fedot.preprocessing.data_type_check import exclude_image, exclude_multi_ts, exclude_ts from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer diff --git a/test/integration/models/test_repository.py 
b/test/integration/models/test_repository.py index 39eee3afd1..fd63299531 100644 --- a/test/integration/models/test_repository.py +++ b/test/integration/models/test_repository.py @@ -7,7 +7,7 @@ from fedot.core.repository.operation_types_repository import (OperationTypesRepository, get_operation_type_from_id) from fedot.core.repository.pipeline_operation_repository import PipelineOperationRepository -from fedot.core.repository.tasks import TaskTypesEnum, Task +from fedot.core.repository.tasks import Task, TaskTypesEnum def mocked_path(): diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 9cc3027ebb..856f59f40d 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -7,13 +7,13 @@ from fedot.core.pipelines.node import PipelineNode from fedot.core.pipelines.pipeline import Pipeline from fedot.core.repository.dataset_types import DataTypesEnum -from fedot.core.repository.tasks import TaskTypesEnum, Task +from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.core.utils import fedot_project_root from fedot.preprocessing.data_types import TYPE_TO_ID from fedot.preprocessing.data_types import TableTypesCorrector, apply_type_transformation from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME -from test.unit.preprocessing.test_pipeline_preprocessing import data_with_mixed_types_in_each_column, \ - correct_preprocessing_params +from test.unit.preprocessing.test_pipeline_preprocessing import correct_preprocessing_params, \ + data_with_mixed_types_in_each_column def get_mixed_data_with_str_and_float_values(idx: int = None): From 8c793bbe99f019c3d6def1c07b7c6444b4d09eb5 Mon Sep 17 00:00:00 2001 From: Sergei Pakulin Date: Fri, 1 Sep 2023 16:24:39 +0500 Subject: [PATCH 57/72] typings --- .../data_operations/categorical_encoders.py | 12 ++++++------ fedot/core/operations/operation.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 0888843268..dce9296c12 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -22,10 +22,10 @@ def __init__(self, params: Optional[OperationParameters] = None): 'handle_unknown': 'ignore' } self.encoder = OneHotEncoder(**{**default_params, **self.params.to_dict()}) - self.categorical_ids = None - self.non_categorical_ids = None - self.encoded_ids = None - self.new_numerical_idx = None + self.categorical_ids: List[int] = [] + self.non_categorical_ids: List[int] = [] + self.encoded_ids: List[int] = [] + self.new_numerical_idx: List[int] = [] def fit(self, input_data: InputData): """ Method for fit encoder with automatic determination of categorical features @@ -104,8 +104,8 @@ def __init__(self, params: Optional[OperationParameters] = None): super().__init__(params) # LabelEncoder has no parameters self.encoders = {} - self.categorical_ids: List[int] = None - self.non_categorical_ids: List[int] = None + self.categorical_ids: List[int] = [] + self.non_categorical_ids: List[int] = [] def fit(self, input_data: InputData): feature_type_ids = input_data.supplementary_data.col_type_ids['features'] diff --git a/fedot/core/operations/operation.py 
b/fedot/core/operations/operation.py index da44065277..3625425c7c 100644 --- a/fedot/core/operations/operation.py +++ b/fedot/core/operations/operation.py @@ -26,7 +26,7 @@ def __init__(self, operation_type: str, **kwargs): self.operation_type = operation_type self._eval_strategy = None - self.operations_repo: OperationTypesRepository = None + self.operations_repo: Optional[OperationTypesRepository] = None self.fitted_operation = None self.log = default_log(self) From ac1a57724037dc9edc37045c139063415f0d262d Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 27 Nov 2023 17:19:59 +0300 Subject: [PATCH 58/72] Adding preprocessing data at once from API --- fedot/api/api_utils/api_data.py | 54 +++++++++++++++++++++++++++ fedot/api/main.py | 10 ++++- fedot/core/data/supplementary_data.py | 2 + fedot/core/pipelines/pipeline.py | 12 ++++-- fedot/core/utils.py | 11 ++++++ 5 files changed, 85 insertions(+), 4 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index ff447fe41a..880d64dc92 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -1,7 +1,10 @@ +import sys +from datetime import datetime from typing import Dict, Union from typing import Optional import numpy as np +from golem.core.log import default_log from fedot.api.api_utils.data_definition import data_strategy_selector, FeaturesType, TargetType from fedot.core.data.data import InputData, OutputData, data_type_is_table @@ -10,6 +13,7 @@ from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.ts_wrappers import in_sample_ts_forecast, convert_forecast_to_output from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.core.utils import convert_memory_size from fedot.preprocessing.dummy_preprocessing import DummyPreprocessor from fedot.preprocessing.preprocessing import DataPreprocessor @@ -39,6 +43,8 @@ def __init__(self, task: Task, use_input_preprocessing: bool = True): self._recommendations = {'cut': self.preprocessor.cut_dataset, 'label_encoded': self.preprocessor.label_encoding_for_fit} + self.log = default_log(self) + def define_data(self, features: FeaturesType, target: Optional[TargetType] = None, @@ -123,3 +129,51 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod for name, rec in recommendations.items(): # Apply desired preprocessing function self._recommendations[name](input_data, *rec.values()) + + def fit_transform(self, train_data: InputData) -> InputData: + start_time = datetime.now() + self.log.message('Preprocessing data') + memory_usage = convert_memory_size(sys.getsizeof(train_data)) + features_shape = train_data.features.shape + target_shape = train_data.target.shape + self.log.message( + f'Train Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') + + train_data = self.preprocessor.obligatory_prepare_for_fit(data=train_data) + train_data = self.preprocessor.optional_prepare_for_fit(pipeline=Pipeline(), data=train_data) + train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data) + train_data.supplementary_data.is_auto_preprocessed = True + + memory_usage = convert_memory_size(sys.getsizeof(train_data)) + features_shape = train_data.features.shape + target_shape = train_data.target.shape + self.log.message( + f'Train Data (Processed) Memory Usage: {memory_usage} Data Shape: {features_shape, target_shape}') + self.log.message(f'Data preprocessing runtime = {datetime.now() - start_time}') + + return 
train_data + + def transform(self, test_data: InputData) -> InputData: + start_time = datetime.now() + self.log.message('Preprocessing data') + memory_usage = convert_memory_size(sys.getsizeof(test_data)) + features_shape = test_data.features.shape + target_shape = test_data.target.shape + self.log.message( + f'Test Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') + + test_data = self.preprocessor.obligatory_prepare_for_predict(data=test_data) + test_data = self.preprocessor.optional_prepare_for_predict(pipeline=Pipeline(), data=test_data) + test_data = self.preprocessor.convert_indexes_for_predict(pipeline=Pipeline(), data=test_data) + test_data = self.preprocessor.update_indices_for_time_series(test_data) + test_data.supplementary_data.is_auto_preprocessed = True + + memory_usage = convert_memory_size(sys.getsizeof(test_data)) + features_shape = test_data.features.shape + target_shape = test_data.target.shape + self.log.message( + f'Test Data (Processed) Memory Usage: {memory_usage} Data Shape: {features_shape, target_shape}') + self.log.message(f'Data preprocessing runtime = {datetime.now() - start_time}') + + return test_data + diff --git a/fedot/api/main.py b/fedot/api/main.py index f1e6718adc..5b7fed8e17 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -86,6 +86,7 @@ def __init__(self, logging_level: int = logging.ERROR, safe_mode: bool = False, n_jobs: int = -1, + auto_preprocessing: bool = False, **composer_tuner_params ): @@ -101,6 +102,7 @@ def __init__(self, self.api_composer = ApiComposer(self.params, self.metrics) + self.auto_preprocessing = auto_preprocessing # Initialize data processors for data preprocessing and preliminary data analysis self.data_processor = ApiDataProcessor(task=self.params.task, use_input_preprocessing=self.params.get('use_input_preprocessing')) @@ -156,6 +158,9 @@ def fit(self, self._init_remote_if_necessary() + if self.auto_preprocessing: + self.train_data = self.data_processor.fit_transform(self.train_data) + if predefined_model is not None: # Fit predefined model and return it without composing self.current_pipeline = PredefinedModel(predefined_model, self.train_data, self.log, @@ -258,6 +263,9 @@ def predict(self, self.test_data = self.data_processor.define_data(target=self.target, features=features, is_predict=True) self._is_in_sample_prediction = in_sample + if self.auto_preprocessing: + self.test_data = self.data_processor.transform(self.test_data) + self.prediction = self.data_processor.define_predictions(current_pipeline=self.current_pipeline, test_data=self.test_data, in_sample=self._is_in_sample_prediction, @@ -521,4 +529,4 @@ def _train_pipeline_on_full_dataset(self, recommendations: Optional[dict], self.current_pipeline.fit( full_train_not_preprocessed, n_jobs=self.params.n_jobs - ) + ) \ No newline at end of file diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index 08c5509a6f..77943a28e6 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -29,6 +29,8 @@ class SupplementaryData: non_int_idx: Optional[list] = None # Dictionary with features and target column type numeric identificators col_type_ids: Optional[Dict[str, np.ndarray]] = None + # Was the data preprocessed before composer + is_auto_preprocessed: bool = False @property def compound_mask(self): diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py index c5a727108c..9ea6f503e1 100644 --- 
a/fedot/core/pipelines/pipeline.py +++ b/fedot/core/pipelines/pipeline.py @@ -184,7 +184,10 @@ def fit(self, input_data: Union[InputData, MultiModalData], """ self.replace_n_jobs_in_nodes(n_jobs) - copied_input_data = self._preprocess(input_data) + if input_data.supplementary_data.is_auto_preprocessed: + copied_input_data = deepcopy(input_data) + else: + copied_input_data = self._preprocess(input_data) copied_input_data = self._assign_data_to_nodes(copied_input_data) if time_constraint is None: @@ -268,8 +271,11 @@ def predict(self, input_data: Union[InputData, MultiModalData], output_mode: str self.log.error(ex) raise ValueError(ex) - # Make copy of the input data to avoid performing inplace operations - copied_input_data = self._preprocess(input_data, is_fit_stage=False) + if input_data.supplementary_data.is_auto_preprocessed: + copied_input_data = deepcopy(input_data) + else: + # Make copy of the input data to avoid performing inplace operations + copied_input_data = self._preprocess(input_data, is_fit_stage=False) copied_input_data = self._assign_data_to_nodes(copied_input_data) result = self.root_node.predict(input_data=copied_input_data, output_mode=output_mode) diff --git a/fedot/core/utils.py b/fedot/core/utils.py index 044e5b2446..dd87cdc431 100644 --- a/fedot/core/utils.py +++ b/fedot/core/utils.py @@ -1,3 +1,4 @@ +import math import os import platform import random @@ -131,3 +132,13 @@ def df_to_html(df: pd.DataFrame, save_path: Union[str, os.PathLike], name: str = if table.parent.name != 'div': table = table.wrap(doc.new_tag('div', style='overflow: auto;')) file.write_text(doc.prettify()) + + +def convert_memory_size(size_bytes): + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return "%s %s" % (s, size_name[i]) \ No newline at end of file From 3084851826957ecf641c0545101fcad7270c1215 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 27 Nov 2023 21:20:07 +0300 Subject: [PATCH 59/72] Fixes in params, data preprocessor merging and fixes in tests --- fedot/api/api_utils/api_data.py | 10 +++--- fedot/api/main.py | 15 ++++----- fedot/core/data/data_preprocessing.py | 11 ++++--- fedot/preprocessing/base_preprocessing.py | 37 ++++++++++++++++++----- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index 880d64dc92..0dbf39036f 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -133,7 +133,7 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod def fit_transform(self, train_data: InputData) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') - memory_usage = convert_memory_size(sys.getsizeof(train_data)) + memory_usage = convert_memory_size(sys.getsizeof(train_data.features)) features_shape = train_data.features.shape target_shape = train_data.target.shape self.log.message( @@ -144,7 +144,7 @@ def fit_transform(self, train_data: InputData) -> InputData: train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data) train_data.supplementary_data.is_auto_preprocessed = True - memory_usage = convert_memory_size(sys.getsizeof(train_data)) + memory_usage = convert_memory_size(sys.getsizeof(train_data.features)) features_shape = train_data.features.shape target_shape = train_data.target.shape self.log.message( @@ -153,7 
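The convert_memory_size helper added to fedot/core/utils.py above is a plain log-base-1024 formatter. Reproduced standalone with a couple of spot checks (a sketch, not part of the patch):

```python
import math

def convert_memory_size(size_bytes):
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Pick the largest unit that keeps the value >= 1, then round to 2 digits
    i = int(math.floor(math.log(size_bytes, 1024)))
    return "%s %s" % (round(size_bytes / math.pow(1024, i), 2), size_name[i])

assert convert_memory_size(0) == "0B"
assert convert_memory_size(1536) == "1.5 KB"
assert convert_memory_size(10 * 1024 ** 2) == "10.0 MB"
```

Note that sys.getsizeof on an InputData object measures only the object shell, not the numpy buffers behind it, which is presumably why the hunks that follow switch the measurement to train_data.features.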
+153,7 @@ def fit_transform(self, train_data: InputData) -> InputData: return train_data - def transform(self, test_data: InputData) -> InputData: + def transform(self, test_data: InputData, current_pipeline) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') memory_usage = convert_memory_size(sys.getsizeof(test_data)) @@ -163,8 +163,8 @@ def transform(self, test_data: InputData) -> InputData: f'Test Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') test_data = self.preprocessor.obligatory_prepare_for_predict(data=test_data) - test_data = self.preprocessor.optional_prepare_for_predict(pipeline=Pipeline(), data=test_data) - test_data = self.preprocessor.convert_indexes_for_predict(pipeline=Pipeline(), data=test_data) + test_data = self.preprocessor.optional_prepare_for_predict(pipeline=current_pipeline, data=test_data) + test_data = self.preprocessor.convert_indexes_for_predict(pipeline=current_pipeline, data=test_data) test_data = self.preprocessor.update_indices_for_time_series(test_data) test_data.supplementary_data.is_auto_preprocessed = True diff --git a/fedot/api/main.py b/fedot/api/main.py index 5b7fed8e17..7627c462a7 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -86,7 +86,6 @@ def __init__(self, logging_level: int = logging.ERROR, safe_mode: bool = False, n_jobs: int = -1, - auto_preprocessing: bool = False, **composer_tuner_params ): @@ -102,7 +101,6 @@ def __init__(self, self.api_composer = ApiComposer(self.params, self.metrics) - self.auto_preprocessing = auto_preprocessing # Initialize data processors for data preprocessing and preliminary data analysis self.data_processor = ApiDataProcessor(task=self.params.task, use_input_preprocessing=self.params.get('use_input_preprocessing')) @@ -158,7 +156,7 @@ def fit(self, self._init_remote_if_necessary() - if self.auto_preprocessing: + if self.params.get('use_input_preprocessing'): self.train_data = self.data_processor.fit_transform(self.train_data) if predefined_model is not None: @@ -180,9 +178,12 @@ def fit(self, else: self.log.message('Already fitted initial pipeline is used') - # Store data encoder in the pipeline if it is required + # Merge API & pipelines encoders if it is required self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( - self.data_processor.preprocessor, self.current_pipeline.preprocessor) + api_preprocessor=self.data_processor.preprocessor, + pipeline_preprocessor=self.current_pipeline.preprocessor, + use_input_preprocessing=self.params.get('use_input_preprocessing') + ) self.log.message(f'Final pipeline: {graph_structure(self.current_pipeline)}') @@ -263,8 +264,8 @@ def predict(self, self.test_data = self.data_processor.define_data(target=self.target, features=features, is_predict=True) self._is_in_sample_prediction = in_sample - if self.auto_preprocessing: - self.test_data = self.data_processor.transform(self.test_data) + if self.params.get('use_input_preprocessing'): + self.test_data = self.data_processor.transform(self.test_data, self.current_pipeline) self.prediction = self.data_processor.define_predictions(current_pipeline=self.current_pipeline, test_data=self.test_data, diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index dccbe12803..c8f9fd383a 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -98,15 +98,16 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: 
return False - column_type_ids = data.supplementary_data.col_type_ids['features'] - cat_ids, non_cat_ids = find_categorical_columns(data.features, column_type_ids) - data_has_categorical_columns = len(cat_ids) > 0 + feature_type_ids = data.supplementary_data.col_type_ids['features'] + cat_ids, non_cat_ids = find_categorical_columns(data.features, feature_type_ids) data.numerical_idx = non_cat_ids data.categorical_idx = cat_ids - data.categorical_features = data.subset_features(cat_ids).features - return data_has_categorical_columns + if len(cat_ids) > 0: + data.categorical_features = data.subset_features(cat_ids).features + + return bool(cat_ids) def data_has_text_features(data: InputData) -> bool: diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index 4c9de6cf5c..df728fa1f3 100644 --- a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -211,7 +211,9 @@ def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligator @staticmethod def merge_preprocessors(api_preprocessor: 'BasePreprocessor', - pipeline_preprocessor: 'BasePreprocessor') -> 'BasePreprocessor': + pipeline_preprocessor: 'BasePreprocessor', + use_input_preprocessing: bool, + ) -> 'BasePreprocessor': """ Combines two preprocessor's objects. @@ -222,11 +224,32 @@ def merge_preprocessors(api_preprocessor: 'BasePreprocessor', Returns: merged preprocessor """ - # Take all obligatory data preprocessing from API - new_data_preprocessor = api_preprocessor + # If was used auto preprocessor + if use_input_preprocessing: + # Take all obligatory data preprocessing from obtained pipelines + new_data_preprocessor = pipeline_preprocessor + + # Update optional preprocessing (take it from API preprocessor) + if not new_data_preprocessor.features_encoders: + # Store features encoder from API preprocessor because there are no encoding in obtained pipelines + new_data_preprocessor.features_encoders = api_preprocessor.features_encoders + + if not new_data_preprocessor.features_imputers: + # Same with Nan's imputers + new_data_preprocessor.features_imputers = api_preprocessor.features_imputers + + # If was used pipelines preprocessors + else: + # Take all obligatory data preprocessing from API + new_data_preprocessor = api_preprocessor + + # Update optional preprocessing (take it from obtained pipeline) + if not new_data_preprocessor.features_encoders: + # Store features encoder from obtained pipeline because in API there are no encoding + new_data_preprocessor.features_encoders = pipeline_preprocessor.features_encoders + + if not new_data_preprocessor.features_imputers: + # Same with Nan's imputers + new_data_preprocessor.features_imputers = pipeline_preprocessor.features_imputers - # Update optional preprocessing (take it from obtained pipeline) - if not new_data_preprocessor.features_encoders: - # Store features encoder from obtained pipeline because in API there are no encoding - new_data_preprocessor.features_encoders = pipeline_preprocessor.features_encoders return new_data_preprocessor From adaf590e99985d53c926143345c92a00d2d7f57f Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 28 Nov 2023 15:26:02 +0300 Subject: [PATCH 60/72] Fixes for MultiModalData --- fedot/core/pipelines/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py index 9ea6f503e1..9f874f4fcc 100644 --- a/fedot/core/pipelines/pipeline.py +++ 
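The rewritten data_has_categorical_features above caches the categorical/numerical column split on the data object, materialises the categorical subset only when at least one categorical column exists, and returns bool(cat_ids) directly. A condensed sketch of that control flow on plain arrays (the ids are assumed precomputed here; the real code derives them via find_categorical_columns):

```python
import numpy as np

def has_categorical_sketch(features: np.ndarray, cat_ids: list):
    # Subset the table only when there is something categorical to subset
    categorical_features = features[:, cat_ids] if len(cat_ids) > 0 else None
    return bool(cat_ids), categorical_features

features = np.array([['a', 1.5], ['b', 2.0]], dtype=object)
has_cats, cats = has_categorical_sketch(features, cat_ids=[0])
assert has_cats and cats.shape == (2, 1)
```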
b/fedot/core/pipelines/pipeline.py @@ -184,7 +184,7 @@ def fit(self, input_data: Union[InputData, MultiModalData], """ self.replace_n_jobs_in_nodes(n_jobs) - if input_data.supplementary_data.is_auto_preprocessed: + if isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed: copied_input_data = deepcopy(input_data) else: copied_input_data = self._preprocess(input_data) @@ -271,7 +271,7 @@ def predict(self, input_data: Union[InputData, MultiModalData], output_mode: str self.log.error(ex) raise ValueError(ex) - if input_data.supplementary_data.is_auto_preprocessed: + if isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed: copied_input_data = deepcopy(input_data) else: # Make copy of the input data to avoid performing inplace operations From 097c1633bec48f704518e7cc6deff33fa5fc7401 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 28 Nov 2023 16:34:54 +0300 Subject: [PATCH 61/72] Added new api param, fix in merge, fixes & editing tests --- fedot/api/api_utils/api_params_repository.py | 3 +- fedot/api/main.py | 8 ++-- fedot/preprocessing/base_preprocessing.py | 18 ++++----- .../test_preprocessing_through_api.py | 40 ++++++++++++++----- 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py index d71e300685..b2ca5d612d 100644 --- a/fedot/api/api_utils/api_params_repository.py +++ b/fedot/api/api_utils/api_params_repository.py @@ -18,7 +18,7 @@ class ApiParamsRepository: COMPOSER_REQUIREMENTS_KEYS = {'max_arity', 'max_depth', 'num_of_generations', 'early_stopping_iterations', 'early_stopping_timeout', - 'parallelization_mode', 'use_input_preprocessing', + 'parallelization_mode', 'use_input_preprocessing', 'use_auto_preprocessing', 'show_progress', 'collect_intermediate_metric', 'keep_n_best', 'keep_history', 'history_dir', 'cv_folds'} @@ -62,6 +62,7 @@ def default_params_for_task(task_type: TaskTypesEnum) -> dict: use_pipelines_cache=True, use_preprocessing_cache=True, use_input_preprocessing=True, + use_auto_preprocessing=False, use_meta_rules=False, cache_dir=default_fedot_data_dir(), keep_history=True, diff --git a/fedot/api/main.py b/fedot/api/main.py index 7627c462a7..b28d3dc8c7 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -142,7 +142,7 @@ def fit(self, self.train_data = self.data_processor.define_data(features=features, target=target, is_predict=False) self.params.update_available_operations_by_preset(self.train_data) - if self.params.get('use_input_preprocessing'): + if self.params.get('use_auto_preprocessing'): # Launch data analyser - it gives recommendations for data preprocessing recommendations_for_data, recommendations_for_params = \ self.data_analyser.give_recommendations(input_data=self.train_data, @@ -156,7 +156,7 @@ def fit(self, self._init_remote_if_necessary() - if self.params.get('use_input_preprocessing'): + if isinstance(self.train_data, InputData) and self.params.get('use_auto_preprocessing'): self.train_data = self.data_processor.fit_transform(self.train_data) if predefined_model is not None: @@ -182,7 +182,7 @@ def fit(self, self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( api_preprocessor=self.data_processor.preprocessor, pipeline_preprocessor=self.current_pipeline.preprocessor, - use_input_preprocessing=self.params.get('use_input_preprocessing') + use_input_preprocessing=self.params.get('use_auto_preprocessing') ) self.log.message(f'Final pipeline: 
{graph_structure(self.current_pipeline)}') @@ -264,7 +264,7 @@ def predict(self, self.test_data = self.data_processor.define_data(target=self.target, features=features, is_predict=True) self._is_in_sample_prediction = in_sample - if self.params.get('use_input_preprocessing'): + if isinstance(self.test_data, InputData) and self.params.get('use_auto_preprocessing'): self.test_data = self.data_processor.transform(self.test_data, self.current_pipeline) self.prediction = self.data_processor.define_predictions(current_pipeline=self.current_pipeline, diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index df728fa1f3..556cff6182 100644 --- a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -227,16 +227,16 @@ def merge_preprocessors(api_preprocessor: 'BasePreprocessor', # If was used auto preprocessor if use_input_preprocessing: # Take all obligatory data preprocessing from obtained pipelines - new_data_preprocessor = pipeline_preprocessor - - # Update optional preprocessing (take it from API preprocessor) - if not new_data_preprocessor.features_encoders: - # Store features encoder from API preprocessor because there are no encoding in obtained pipelines - new_data_preprocessor.features_encoders = api_preprocessor.features_encoders + new_data_preprocessor = api_preprocessor - if not new_data_preprocessor.features_imputers: - # Same with Nan's imputers - new_data_preprocessor.features_imputers = api_preprocessor.features_imputers + # # Update optional preprocessing (take it from API preprocessor) + # if not new_data_preprocessor.features_encoders: + # # Store features encoder from API preprocessor because there are no encoding in obtained pipelines + # new_data_preprocessor.features_encoders = api_preprocessor.features_encoders + # + # if not new_data_preprocessor.features_imputers: + # # Same with Nan's imputers + # new_data_preprocessor.features_imputers = api_preprocessor.features_imputers # If was used pipelines preprocessors else: diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py index a4d0e83fd8..6e42ee0975 100644 --- a/test/unit/preprocessing/test_preprocessing_through_api.py +++ b/test/unit/preprocessing/test_preprocessing_through_api.py @@ -16,7 +16,7 @@ def data_with_only_categorical_features(): features = np.array([["'a'", "0", "1"], ["'b'", "1", "0"], ["'c'", "1", "0"]], dtype=object) - input_data = InputData(idx=[0, 1, 2], features=features, + input_data = InputData(idx=np.array([0, 1, 2]), features=features, target=np.array([0, 1, 2]), task=task, data_type=DataTypesEnum.table, supplementary_data=supp_data) @@ -41,7 +41,7 @@ def data_with_too_much_nans(): [9, '1', np.inf], [8, np.nan, np.inf]], dtype=object) target = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -61,7 +61,7 @@ def data_with_spaces_and_nans_in_features(): ['0 ', ' 1'], ['1 ', ' 0']], dtype=object) target = np.array([[0], [1], [2], [3], [4], [5]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5]), features=features, target=target, task=task, data_type=DataTypesEnum.table, 
supplementary_data=SupplementaryData()) @@ -78,7 +78,7 @@ def data_with_nans_in_target_column(): [3, 4], [1, 3]]) target = np.array([[0], [1], [np.nan], [np.nan], [4], [5]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -98,7 +98,7 @@ def data_with_nans_in_multi_target(): [3, 4], [1, 3]]) target = np.array([[0, 2], [1, 3], [np.nan, np.nan], [3, np.nan], [4, 4], [5, 6]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -123,7 +123,7 @@ def data_with_categorical_target(with_nan: bool = False): target = np.array(['blue', np.nan, np.nan, 'di'], dtype=object) else: target = np.array(['blue', 'da', 'ba', 'di'], dtype=str) - train_input = InputData(idx=[0, 1, 2, 3], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -140,7 +140,7 @@ def data_with_text_features(): dtype=object) target = np.array([[0], [1], [0], [1]]) - train_input = InputData(idx=[0, 1, 2, 3], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3]), features=features, target=target, task=task, data_type=DataTypesEnum.text, supplementary_data=SupplementaryData()) @@ -159,7 +159,7 @@ def data_with_pseudo_text_features(): dtype=object) target = np.array([[0], [1], [0], [1], [0]]) - train_input = InputData(idx=[0, 1, 2, 3, 4], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -177,7 +177,7 @@ def data_with_text_features_and_nans(): dtype=object) target = np.array([[0], [1], [0], [1], [0]]) - train_input = InputData(idx=[0, 1, 2, 3, 4], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4]), features=features, target=target, task=task, data_type=DataTypesEnum.text, supplementary_data=SupplementaryData()) @@ -250,3 +250,25 @@ def test_correct_api_dataset_with_pseudo_text_preprocessing(): node_tags = [node.tags for node in fedot_model.current_pipeline.nodes] assert not any('text' in current_tags for current_tags in node_tags) assert fedot_model.prediction.features.shape[0] == input_data.features.shape[0] + + +def test_auto_preprocessing_mode(): + funcs = [data_with_only_categorical_features, data_with_too_much_nans, + data_with_spaces_and_nans_in_features, data_with_nans_in_target_column, + data_with_nans_in_multi_target] + + # Check for all datasets + for data_generator in funcs: + input_data = data_generator() + single_processing = Fedot(problem='regression', use_auto_preprocessing=True) + multi_processing = Fedot(problem='regression', use_auto_preprocessing=False) + + pipeline_single = single_processing.fit(input_data, predefined_model='auto') + pipeline_multi = multi_processing.fit(input_data, predefined_model='auto') + + prediction_single = pipeline_single.predict(input_data) + prediction_multi = pipeline_multi.predict(input_data) + + assert prediction_single.features.shape == prediction_multi.features.shape + assert (prediction_single.features == prediction_multi.features).all() + assert (prediction_single.predict ==
prediction_multi.predict).all() From 94b6af56c6db138deee3191a93983f06ebcb09e6 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 29 Nov 2023 20:15:15 +0300 Subject: [PATCH 62/72] Fix param for test --- fedot/api/api_utils/api_params_repository.py | 2 +- fedot/api/builder.py | 2 ++ test/unit/api/test_api_params.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py index b2ca5d612d..e1626db0b1 100644 --- a/fedot/api/api_utils/api_params_repository.py +++ b/fedot/api/api_utils/api_params_repository.py @@ -18,7 +18,7 @@ class ApiParamsRepository: COMPOSER_REQUIREMENTS_KEYS = {'max_arity', 'max_depth', 'num_of_generations', 'early_stopping_iterations', 'early_stopping_timeout', - 'parallelization_mode', 'use_input_preprocessing', 'use_auto_preprocessing', + 'parallelization_mode', 'use_input_preprocessing', 'show_progress', 'collect_intermediate_metric', 'keep_n_best', 'keep_history', 'history_dir', 'cv_folds'} diff --git a/fedot/api/builder.py b/fedot/api/builder.py index 449b076833..1a7b7bdf15 100644 --- a/fedot/api/builder.py +++ b/fedot/api/builder.py @@ -330,6 +330,7 @@ def setup_data_preprocessing( safe_mode: bool = DEFAULT_VALUE, use_input_preprocessing: bool = DEFAULT_VALUE, use_preprocessing_cache: bool = DEFAULT_VALUE, + use_auto_preprocessing: bool = DEFAULT_VALUE, ) -> FedotBuilder: """ Sets parameters of input data preprocessing. @@ -351,6 +352,7 @@ def setup_data_preprocessing( safe_mode=safe_mode, use_input_preprocessing=use_input_preprocessing, use_preprocessing_cache=use_preprocessing_cache, + use_auto_preprocessing=use_auto_preprocessing, ) return self diff --git a/test/unit/api/test_api_params.py b/test/unit/api/test_api_params.py index 7295ababa9..f19e96b0d0 100644 --- a/test/unit/api/test_api_params.py +++ b/test/unit/api/test_api_params.py @@ -35,6 +35,7 @@ use_pipelines_cache=True, use_preprocessing_cache=True, use_input_preprocessing=True, + use_auto_preprocessing=False, cache_dir='cache', keep_history=True, history_dir='history', From be007cb768d6887c7b07d7b03e416ee77d1045bb Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 29 Nov 2023 20:34:30 +0300 Subject: [PATCH 63/72] Fix bug in API --- fedot/api/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/main.py b/fedot/api/main.py index b28d3dc8c7..a9c42ec1f5 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -142,7 +142,7 @@ def fit(self, self.train_data = self.data_processor.define_data(features=features, target=target, is_predict=False) self.params.update_available_operations_by_preset(self.train_data) - if self.params.get('use_auto_preprocessing'): + if self.params.get('use_input_preprocessing'): # Launch data analyser - it gives recommendations for data preprocessing recommendations_for_data, recommendations_for_params = \ self.data_analyser.give_recommendations(input_data=self.train_data, From 8e046f51abfe2d761b4ca6a2faec1d5de18768af Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 5 Dec 2023 14:54:09 +0300 Subject: [PATCH 64/72] @kasyanovse requested improvements --- fedot/core/data/data_preprocessing.py | 12 +++--------- fedot/preprocessing/preprocessing.py | 6 ++---- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c8f9fd383a..b0076e9f0a 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -72,15
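From the user side, the flag introduced in these patches is exercised exactly as in test_auto_preprocessing_mode above. A minimal usage sketch follows; the import path and toy dataset are assumptions for illustration, not taken from the patch:

```python
from sklearn.datasets import load_iris  # hypothetical demo dataset

from fedot.api.main import Fedot

x, y = load_iris(return_X_y=True)
model = Fedot(problem='classification', use_auto_preprocessing=True)
pipeline = model.fit(features=x, target=y, predefined_model='auto')
prediction = model.predict(features=x)
```

With use_auto_preprocessing=True the API preprocesses the data once up front and marks it via supplementary_data.is_auto_preprocessed, so Pipeline.fit and Pipeline.predict skip their own obligatory preprocessing pass.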
+72,9 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda def force_categorical_determination(table: np.ndarray): """ Find string columns using 'computationally expensive' approach """ - categorical_ids = [] - non_categorical_ids = [] - # For every column in table make check - for column_id, column in enumerate(table.T): - # Check if column is of string objects - if pd.api.types.infer_dtype(column, skipna=True) == 'string': - categorical_ids.append(column_id) - else: - non_categorical_ids.append(column_id) + real_columns_selector = np.all(np.isreal(table), axis=0) + non_categorical_ids = np.flatnonzero(real_columns_selector).tolist() + categorical_ids = np.flatnonzero(~real_columns_selector).tolist() return categorical_ids, non_categorical_ids diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 95985539d0..b45e2ceabc 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -314,11 +314,9 @@ def _clean_extra_spaces(data: InputData) -> InputData: """ def strip_all_strs(item: Union[object, str]): - try: + if isinstance(item, str): return item.strip() - except AttributeError: - # not an str object - return item + return item features_df = pd.DataFrame(data.features) mixed_or_str = features_df.select_dtypes(object) From cac26f691c002909f599127b69eeac8ea44db946 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 5 Dec 2023 17:08:42 +0300 Subject: [PATCH 65/72] Return fixes --- fedot/core/data/data_preprocessing.py | 14 +++++++++----- fedot/preprocessing/preprocessing.py | 6 ++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index b0076e9f0a..49650e71c7 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -72,11 +72,15 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda def force_categorical_determination(table: np.ndarray): """ Find string columns using 'computationally expensive' approach """ - real_columns_selector = np.all(np.isreal(table), axis=0) - non_categorical_ids = np.flatnonzero(real_columns_selector).tolist() - categorical_ids = np.flatnonzero(~real_columns_selector).tolist() - - return categorical_ids, non_categorical_ids + categorical_ids = [] + non_categorical_ids = [] + # For every column in table make check + for column_id, column in enumerate(table.T): + # Check if column is of string objects + if pd.api.types.infer_dtype(column, skipna=True) == 'string': + categorical_ids.append(column_id) + else: + non_categorical_ids.append(column_id) def data_has_missing_values(data: InputData) -> bool: diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index b45e2ceabc..95985539d0 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -314,9 +314,11 @@ def _clean_extra_spaces(data: InputData) -> InputData: """ def strip_all_strs(item: Union[object, str]): - if isinstance(item, str): + try: return item.strip() - return item + except AttributeError: + # not an str object + return item features_df = pd.DataFrame(data.features) mixed_or_str = features_df.select_dtypes(object) From 5f62ef408834fed5f4a387f2b71ac61aae2346a2 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 6 Dec 2023 15:06:06 +0300 Subject: [PATCH 66/72] Return fixes (1) --- fedot/core/data/data_preprocessing.py | 2 ++ 1 file changed, 2 insertions(+) 
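Patches 64 and 65 toggle strip_all_strs between an isinstance check (LBYL) and the original try/except (EAFP). The two guards are behaviourally equivalent on mixed-type cells; the EAFP form merely avoids an explicit type check per cell. A quick equivalence sketch, not part of the patch:

```python
import pandas as pd

def strip_lbyl(item):
    # Look-before-you-leap: check the type explicitly
    return item.strip() if isinstance(item, str) else item

def strip_eafp(item):
    # Easier-to-ask-forgiveness: rely on non-strings lacking .strip()
    try:
        return item.strip()
    except AttributeError:  # not a str object
        return item

df = pd.DataFrame([[' a ', 1], ['b ', None]], dtype=object)
assert df.applymap(strip_lbyl).equals(df.applymap(strip_eafp))
```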
diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 49650e71c7..c8f9fd383a 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -82,6 +82,8 @@ def force_categorical_determination(table: np.ndarray): else: non_categorical_ids.append(column_id) + return categorical_ids, non_categorical_ids + def data_has_missing_values(data: InputData) -> bool: """ Check data for missing values.""" From b96214849444dea767df69bceb4c4588cea180c9 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 14:09:19 +0300 Subject: [PATCH 67/72] Remove transformations to str categories --- fedot/api/api_utils/input_analyser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index d626835741..3be6ecc3f9 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -115,5 +115,5 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) + uniques = np.unique(input_data.features[:, categorical_ids]) return len(uniques) > self.max_cat_cardinality From d5b06482e9bac215c30c60f7a55642be452d00e4 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 14:27:57 +0300 Subject: [PATCH 68/72] Return transformations to str for categories --- fedot/api/api_utils/input_analyser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 3be6ecc3f9..d626835741 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -115,5 +115,5 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - uniques = np.unique(input_data.features[:, categorical_ids]) + uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) return len(uniques) > self.max_cat_cardinality From 8be836e9b545affbdd8ad9cfa834869fa0691e3e Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 17:01:36 +0300 Subject: [PATCH 69/72] Fix control_categorical for label encoder --- fedot/api/api_utils/input_analyser.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index d626835741..25dd01fa64 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -77,6 +77,7 @@ def _give_recommendations_for_data(self, input_data: InputData) -> Dict: recommendations_for_data['cut'] = {'border': border} is_label_encoding_needed = self.control_categorical(input_data) if is_label_encoding_needed: + self._log('Switch categorical encoder to label encoder') recommendations_for_data['label_encoded'] = {} return recommendations_for_data @@ -115,5 +116,6 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) - return len(uniques) > self.max_cat_cardinality + # Count unique categories in each categorical column and sum them up + uniques_cats = sum([len(np.unique(feature)) for feature in input_data.features[:, categorical_ids].astype(str).T]) + return uniques_cats >
self.max_cat_cardinality From a91c9ba7ee2b2fc58ecd2bdb57ee076857fa4539 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 17:13:05 +0300 Subject: [PATCH 70/72] Fix log message --- fedot/api/api_utils/input_analyser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 25dd01fa64..61f91770e5 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -77,7 +77,7 @@ def _give_recommendations_for_data(self, input_data: InputData) -> Dict: recommendations_for_data['cut'] = {'border': border} is_label_encoding_needed = self.control_categorical(input_data) if is_label_encoding_needed: - self._log('Switch categorical encoder to label encoder') + self._log.info('Switch categorical encoder to label encoder') recommendations_for_data['label_encoded'] = {} return recommendations_for_data From 304e29fb0d2d62d029ade22bb0ddeca5af2ac479 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 8 Dec 2023 23:01:12 +0300 Subject: [PATCH 71/72] Small fixes with merger --- fedot/api/main.py | 2 +- fedot/preprocessing/base_preprocessing.py | 13 ++----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/fedot/api/main.py b/fedot/api/main.py index a9c42ec1f5..81bbb460da 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -182,7 +182,7 @@ def fit(self, self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( api_preprocessor=self.data_processor.preprocessor, pipeline_preprocessor=self.current_pipeline.preprocessor, - use_input_preprocessing=self.params.get('use_auto_preprocessing') + use_auto_preprocessing=self.params.get('use_auto_preprocessing') ) self.log.message(f'Final pipeline: {graph_structure(self.current_pipeline)}') diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index 556cff6182..7871af8fc4 100644 --- a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -212,7 +212,7 @@ def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligator @staticmethod def merge_preprocessors(api_preprocessor: 'BasePreprocessor', pipeline_preprocessor: 'BasePreprocessor', - use_input_preprocessing: bool, + use_auto_preprocessing: bool, ) -> 'BasePreprocessor': """ Combines two preprocessor's objects. 
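With the column-wise iteration in patch 69, control_categorical sums per-column cardinalities before comparing the total against max_cat_cardinality. On a toy table (a sketch with precomputed categorical ids):

```python
import numpy as np

features = np.array([['a', 'x'], ['b', 'x'], ['a', 'y']], dtype=object)
categorical_ids = [0, 1]

# Two unique categories per column -> total cardinality of 4
uniques_cats = sum(len(np.unique(col))
                   for col in features[:, categorical_ids].astype(str).T)
assert uniques_cats == 4
```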
@@ -225,19 +225,10 @@ def merge_preprocessors(api_preprocessor: 'BasePreprocessor', merged preprocessor """ # If was used auto preprocessor - if use_input_preprocessing: + if use_auto_preprocessing: # Take all obligatory data preprocessing from obtained pipelines new_data_preprocessor = api_preprocessor - # # Update optional preprocessing (take it from API preprocessor) - # if not new_data_preprocessor.features_encoders: - # # Store features encoder from API preprocessor because there are no encoding in obtained pipelines - # new_data_preprocessor.features_encoders = api_preprocessor.features_encoders - # - # if not new_data_preprocessor.features_imputers: - # # Same with Nan's imputers - # new_data_preprocessor.features_imputers = api_preprocessor.features_imputers - # If was used pipelines preprocessors else: # Take all obligatory data preprocessing from API From 0c48f7ff945c825c9af59c86b51e9d3e3eff5a4b Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 11 Dec 2023 19:52:06 +0300 Subject: [PATCH 72/72] @andreygetmanov requested fixes --- fedot/core/utils.py | 10 +++++----- fedot/preprocessing/preprocessing.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fedot/core/utils.py b/fedot/core/utils.py index dd87cdc431..8e1654e7c9 100644 --- a/fedot/core/utils.py +++ b/fedot/core/utils.py @@ -137,8 +137,8 @@ def df_to_html(df: pd.DataFrame, save_path: Union[str, os.PathLike], name: str = def convert_memory_size(size_bytes): if size_bytes == 0: return "0B" - size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - i = int(math.floor(math.log(size_bytes, 1024))) - p = math.pow(1024, i) - s = round(size_bytes / p, 2) - return "%s %s" % (s, size_name[i]) \ No newline at end of file + digit_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + integer_size_value = int(math.floor(math.log(size_bytes, 1024))) + byte_digit = math.pow(1024, integer_size_value) + size_in_digit_name = round(size_bytes / byte_digit, 2) + return "%s %s" % (size_in_digit_name, digit_name[integer_size_value]) \ No newline at end of file diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 95985539d0..ac1c165fb4 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -317,7 +317,7 @@ def strip_all_strs(item: Union[object, str]): try: return item.strip() except AttributeError: - # not an str object + # not a str object return item features_df = pd.DataFrame(data.features)
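Taken together, patches 59, 61 and 71 reduce the merge rule to: the API preprocessor always provides the obligatory steps, and pipeline-side encoders and imputers are borrowed only when auto preprocessing was off and the API side has none of its own. A condensed sketch of the final behaviour, with duck-typed preprocessor objects assumed:

```python
def merge_preprocessors_sketch(api_prep, pipeline_prep, use_auto_preprocessing: bool):
    merged = api_prep  # obligatory preprocessing always comes from the API side
    if not use_auto_preprocessing:
        # Optional steps were fitted inside the obtained pipeline; borrow them
        # only when the API preprocessor has none of its own
        if not merged.features_encoders:
            merged.features_encoders = pipeline_prep.features_encoders
        if not merged.features_imputers:
            merged.features_imputers = pipeline_prep.features_imputers
    return merged
```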