From 8f058909f5fec3f0387a559fa63d5304f81f15b1 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Fri, 17 Feb 2023 14:29:49 +0300 Subject: [PATCH] optimizations and style fixes --- fedot/core/data/data_preprocessing.py | 6 +-- .../data_operations/categorical_encoders.py | 14 +++--- fedot/core/repository/json_evaluation.py | 4 +- fedot/preprocessing/categorical.py | 50 ++++++------------- fedot/preprocessing/data_types.py | 10 ++-- 5 files changed, 31 insertions(+), 53 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index f9da888840..99404353e8 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - has_infs = (features == np.inf) | (features == -np.inf) - if np.any(has_infs): - features[has_infs] = np.nan + inf_idxs: Tuple[np.ndarray, ...] = ((features == np.inf) | (features == -np.inf)).nonzero() + if len(inf_idxs[0]): + features[inf_idxs] = np.nan def replace_nans_with_empty_strings(input_data: InputData): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 345235e29a..ac43f978c8 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Optional +from typing import Optional, Tuple import numpy as np import pandas as pd @@ -137,9 +137,9 @@ def transform(self, input_data: InputData) -> OutputData: # If categorical features are exists - transform them inplace in InputData for categorical_id in self.categorical_ids: categorical_column = input_data.features[:, categorical_id] - has_nan: np.ndarray = pd.isna(categorical_column) + nan_idxs: Tuple[np.ndarray, ...] = pd.isna(categorical_column).nonzero() - transformed = self._apply_label_encoder(categorical_column, categorical_id, has_nan) + transformed = self._apply_label_encoder(categorical_column, categorical_id, nan_idxs) copied_data.features[:, categorical_id] = transformed output_data = self._convert_to_output(copied_data, @@ -168,21 +168,21 @@ def _fit_label_encoders(self, input_data: InputData): self.encoders.update({categorical_id: le}) def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int, - has_nan: np.ndarray) -> np.ndarray: + nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray: """ Apply fitted LabelEncoder for column transformation :param categorical_column: numpy array with categorical features :param categorical_id: index of current categorical column - :param has_nan: bool array of gap elements in the ``categorical_column`` + :param nan_idxs: indices of gap elements in the ``categorical_column`` """ column_encoder = self.encoders[categorical_id] column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column))) transformed_column = column_encoder.transform(categorical_column) - if len(has_nan) > 0: + if len(nan_idxs[0]): # Store np.nan values transformed_column = transformed_column.astype(object) - transformed_column[has_nan] = np.nan + transformed_column[nan_idxs] = np.nan return transformed_column diff --git a/fedot/core/repository/json_evaluation.py b/fedot/core/repository/json_evaluation.py index 3ab0a96e93..ba4483ce0e 100644 --- a/fedot/core/repository/json_evaluation.py +++ b/fedot/core/repository/json_evaluation.py @@ -1,7 +1,7 @@ from importlib import import_module from typing import Union, TYPE_CHECKING, List -# imports are required for the eval +# imports are required beneath in the function from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum @@ -39,7 +39,7 @@ def import_enums_from_str(field_value: str) -> Union[List[DataTypesEnum], Returns: list of either class:`DataTypesEnum` or class:`TaskTypesEnum` values """ - enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val != ''] + enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val] return [ getattr(globals()[data_type], value) for (data_type, value) in enums] diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 09368286c5..fe5fa1eeaf 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -33,26 +33,23 @@ def fit(self, input_data: InputData): return self binary_ids_to_convert = [] - number_of_columns = input_data.features.shape[-1] - for column_id in range(number_of_columns): - pd_column = pd.Series(input_data.features[:, column_id], copy=True) - has_nan = pd_column.isna() - if has_nan.sum() and column_id in categorical_ids: + for column_id, column in enumerate(input_data.features.T): + pd_column = pd.Series(column, copy=True) + is_nan = pd_column.isna() + column_uniques = pd_column.unique() + if is_nan.sum() and column_id in categorical_ids: # This categorical column has nans - replaced_column, _ = replace_nans_with_fedot_nans(pd_column, has_nan) - column_uniques = replaced_column.unique() + pd_column[is_nan] = FEDOT_STR_NAN if len(column_uniques) <= 3: # There is column with binary categories and gaps self.binary_features_with_nans.append(column_id) binary_ids_to_convert.append(column_id) - self._train_encoder(replaced_column, column_id) + self._train_encoder(pd_column, column_id) else: - column_uniques = pd_column.unique() if len(column_uniques) <= 2 and column_id in categorical_ids: # Column contains binary string feature binary_ids_to_convert.append(column_id) - # Train encoder for current column self._train_encoder(pd_column, column_id) @@ -67,26 +64,15 @@ def transform(self, input_data: InputData) -> InputData: # There are no binary categorical features return input_data - converted_features = [] - number_of_columns = input_data.features.shape[-1] - for column_id in range(number_of_columns): + copied_data = deepcopy(input_data) + for column_id, column in enumerate(copied_data.features.T): if column_id in self.binary_ids_to_convert: # If column contains nans - replace them with fedot nans special string - pd_column = pd.Series(input_data.features[:, column_id]) - has_nan = pd_column.isna() - replaced_column, has_nan = replace_nans_with_fedot_nans(pd_column, has_nan) + nan_idxs: Tuple[np.ndarray, ...] = pd.isna(column).nonzero() + column[nan_idxs] = FEDOT_STR_NAN # Convert into integers - converted_column = self._apply_encoder(replaced_column, column_id, has_nan) - else: - # Stay column the same - converted_column = input_data.features[:, column_id] - - converted_features.append(converted_column.reshape((-1, 1))) - - # Store transformed features - copied_data = deepcopy(input_data) - copied_data.features = np.hstack(converted_features) + column[:] = self._apply_encoder(column, column_id, nan_idxs) # Update features types features_types = copied_data.supplementary_data.column_types['features'] @@ -117,22 +103,16 @@ def _train_encoder(self, column: pd.Series, column_id: int): # Store fitted label encoder for transform method self.binary_encoders.update({column_id: encoder}) - def _apply_encoder(self, column: pd.Series, column_id: int, has_nan: pd.Series) -> np.ndarray: + def _apply_encoder(self, column: np.ndarray, column_id: int, nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray: """ Apply already fitted encoders """ encoder = self.binary_encoders[column_id] # Extend encoder classes if the column contains categories not previously encountered encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) - if len(has_nan) > 0: + if len(nan_idxs[0]): # Column has nans in its structure - after conversion replace it converted = converted.astype(float) - converted[has_nan] = np.nan + converted[nan_idxs] = np.nan return converted - - -def replace_nans_with_fedot_nans(column: pd.Series, has_nan: pd.Series) -> Tuple[pd.Series, pd.Series]: - # Add new category - 'fedot_nan' after converting it will be replaced by nans - column[has_nan] = FEDOT_STR_NAN - return column, has_nan diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 1082ea43e9..0298d456e2 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -217,8 +217,8 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) data.supplementary_data.column_types['features'] = [ - col_type - for col_id, col_type in enumerate(data.supplementary_data.column_types['features']) + col_type_id + for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) if col_id not in self.string_columns_transformation_failed ] @@ -429,9 +429,7 @@ def define_column_types(table: np.ndarray): table_of_types[nans] = TYPE_TO_ID[type(None)] columns_info = {} - for column_id in range(n_columns): - col_types = table_of_types[:, column_id] - + for column_id, col_types in enumerate(table_of_types.T): unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True) if len(unique_col_types) > 1: @@ -445,7 +443,7 @@ def define_column_types(table: np.ndarray): ] # Store information about nans in the target - nan_ids = np.where(nans[:, column_id])[0] + nan_ids = np.nonzero(nans[:, column_id])[0] columns_info.update({column_id: {'types': unique_col_types, 'str_number': str_number, 'int_number': int_number,