From f2cacc2f42958d3cbca01681711cf3c052125a83 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 21 Nov 2022 23:14:34 +0300 Subject: [PATCH 01/72] accelerated define_column_types --- fedot/preprocessing/data_types.py | 43 ++++++++++--------------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 5f8b8557c0..bff1bd649b 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -412,21 +412,14 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): features_types[column_id] = NAME_CLASS_FLOAT -def define_column_types(table: np.array): +def define_column_types(table: np.ndarray): """ Prepare information about types per columns. For each column store unique types, which column contains. If column with mixed type contain str object additional field 'str_ids' with indices of string objects is prepared """ - - # TODO: current processing is relatively computationally expensive - probably refactor needed - - def type_ignoring_nans(item): - """ Return type of element in the array. If item is np.nan - return NoneType """ - current_type = type(item) - if current_type is float and np.isnan(item): - # Check is current element is nan or not (np.nan is a float type) - return type(None) - return current_type + def to_type(item): + return str(type(item)) + vto_type = np.vectorize(to_type) if table is None: return {} @@ -436,34 +429,24 @@ def type_ignoring_nans(item): for column_id in range(n_columns): current_column = table[:, column_id] - # Check every element in numpy array - it can take a long time! - column_types = list(map(type_ignoring_nans, current_column)) - - # Store only unique values - set_column_types = set(column_types) - # Convert types into string names - column_types_names = list(map(str, set_column_types)) + column_types = np.where(pd.isna(current_column), str(type(None)), vto_type(current_column)) - if len(column_types_names) > 1: - # There are several types in one column - types_names = np.array(column_types, dtype=str) - # Calculate number of string objects in the dataset - str_number = len(np.argwhere(types_names == NAME_CLASS_STR)) - int_number = len(np.argwhere(types_names == NAME_CLASS_INT)) - float_number = len(np.argwhere(types_names == NAME_CLASS_FLOAT)) + if len(np.unique(column_types)) > 1: + str_number = (column_types == NAME_CLASS_STR).sum() + int_number = (column_types == NAME_CLASS_INT).sum() + float_number = (column_types == NAME_CLASS_FLOAT).sum() # Store information about nans in the target - nan_ids = np.ravel(np.argwhere(types_names == NAME_CLASS_NONE)) - nan_number = len(nan_ids) - columns_info.update({column_id: {'types': column_types_names, + nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) # TODO: maybe just convert to list to preserve idx pairs? 
+ columns_info.update({column_id: {'types': column_types, 'str_number': str_number, 'int_number': int_number, 'float_number': float_number, - 'nan_number': nan_number, + 'nan_number': len(nan_ids), 'nan_ids': nan_ids}}) else: # There is only one type, or several types such as int and float - columns_info.update({column_id: {'types': column_types_names}}) + columns_info.update({column_id: {'types': column_types}}) return columns_info From 4a5a2cfbcc8f65df654359c9bec72288123c1405 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 22 Nov 2022 12:44:54 +0300 Subject: [PATCH 02/72] hotfix for pytests --- fedot/preprocessing/data_types.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index bff1bd649b..b426b2267a 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -430,15 +430,16 @@ def to_type(item): current_column = table[:, column_id] column_types = np.where(pd.isna(current_column), str(type(None)), vto_type(current_column)) + unique_column_types = np.unique(column_types) - if len(np.unique(column_types)) > 1: + if len(unique_column_types) > 1: str_number = (column_types == NAME_CLASS_STR).sum() int_number = (column_types == NAME_CLASS_INT).sum() float_number = (column_types == NAME_CLASS_FLOAT).sum() # Store information about nans in the target nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) # TODO: maybe just convert to list to preserve idx pairs? - columns_info.update({column_id: {'types': column_types, + columns_info.update({column_id: {'types': unique_column_types, 'str_number': str_number, 'int_number': int_number, 'float_number': float_number, @@ -446,7 +447,7 @@ def to_type(item): 'nan_ids': nan_ids}}) else: # There is only one type, or several types such as int and float - columns_info.update({column_id: {'types': column_types}}) + columns_info.update({column_id: {'types': unique_column_types}}) return columns_info From c0bff917ee9097264bcded33ce2ad17eba2c2063 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 22 Nov 2022 16:49:52 +0300 Subject: [PATCH 03/72] accelerated _clean_extra_spaces --- fedot/preprocessing/preprocessing.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 3e1cce5374..d541f94f98 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -357,10 +357,18 @@ def _clean_extra_spaces(data: InputData) -> InputData: Returns: cleaned ``data`` """ - features = pd.DataFrame(data.features) - features = features.applymap(lambda x: x.strip() if isinstance(x, str) else x) - - data.features = np.array(features) + def strip_all_strs(item: Union[object, str]): + try: + return item.strip() + except AttributeError: + # not an str object + return item + + features_df = pd.DataFrame(data.features) + mixed_or_str = features_df.select_dtypes(object) + features_df[mixed_or_str.columns] = mixed_or_str.applymap(strip_all_strs) + + data.features = features_df.to_numpy() return data @copy_doc(BasePreprocessor.label_encoding_for_fit) From 986d5348954ba1940795adbdbf5fc879d9745538 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 23 Nov 2022 16:55:15 +0300 Subject: [PATCH 04/72] convert num col to str optimized --- fedot/preprocessing/data_types.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/fedot/preprocessing/data_types.py 
b/fedot/preprocessing/data_types.py index b426b2267a..91eeeba6bf 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -494,17 +494,10 @@ def type_by_name(current_type_name: str): def convert_num_column_into_string_array(numerical_column: pd.Series) -> np.array: """ Convert pandas column into numpy one-dimensional array """ - # Convert into string - converted_column = numerical_column.astype(str) - converted_array = converted_column.values - - # If there are nans - insert them - nan_ids = np.ravel(np.argwhere(converted_array == 'nan')) - if len(nan_ids) > 0: - converted_array = converted_array.astype(object) - converted_array[nan_ids] = np.nan - - return converted_array + # convert only non-nans values + true_nums = numerical_column[numerical_column.notna()] + numerical_column[true_nums.index] = true_nums.astype(str, copy=False) + return numerical_column.to_numpy() def _obtain_new_column_type(column_info): From c594f81235da761ba19ed35134a139411b9b6df3 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 23 Nov 2022 17:46:27 +0300 Subject: [PATCH 05/72] type inference fixes --- fedot/preprocessing/data_types.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 91eeeba6bf..4ccb62b4ec 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -9,7 +9,7 @@ from fedot.core.repository.tasks import Task, TaskTypesEnum NoneType = type(None) -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Tuple if TYPE_CHECKING: from fedot.core.data.data import InputData @@ -281,7 +281,7 @@ def _convert_feature_into_one_type(self, mixed_column: np.array, column_info: di return None, 'removed' def _convert_target_into_one_type(self, mixed_column: np.array, column_info: dict, mixed_column_id: int, - task: Task) -> [np.array, str]: + task: Task) -> Tuple[np.ndarray, str]: """ Convert target columns into one type based on column proportions of object and task """ if task.task_type is TaskTypesEnum.classification: # For classification labels are string if at least one element is a string @@ -309,7 +309,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - n_rows, n_cols = data.features.shape + _, n_cols = data.features.shape for column_id in range(n_cols): # For every int/float column perform check column_type = data.supplementary_data.column_types['features'][column_id] @@ -339,7 +339,7 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # There is no transformation for current table return data - n_rows, n_cols = data.features.shape + _, n_cols = data.features.shape for column_id in range(n_cols): if column_id in self.numerical_into_str: numerical_column = pd.Series(data.features[:, column_id]) From 54d10eedb3191b43bbcf7fe0324ed86687c1f2c5 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 24 Nov 2022 18:42:45 +0300 Subject: [PATCH 06/72] categorical.py/data_preprocessing.py refactored --- fedot/core/data/data_preprocessing.py | 63 ++++++++++---------------- fedot/preprocessing/categorical.py | 65 +++++++++++---------------- 2 files changed, 50 insertions(+), 78 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 6d509f67c2..2bdba0c1ce 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,35 +1,35 @@ import numpy as np import pandas as pd +from typing import Tuple, Optional + from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts from fedot.core.repository.dataset_types import DataTypesEnum -def data_type_is_suitable_preprocessing(data: InputData) -> bool: - if data_type_is_table(data) or data_type_is_ts(data) or data_type_is_multi_ts(data): - return True - return False +def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: + return data_type_is_table(data) or data_type_is_ts(data) or data_type_is_multi_ts(data) def replace_inf_with_nans(input_data: InputData): - values_to_replace = [np.inf, -np.inf] - features_with_replaced_inf = np.where(np.isin(input_data.features, - values_to_replace), - np.nan, - input_data.features) - input_data.features = features_with_replaced_inf + features = input_data.features + if features.dtype == object: + print(features[:2]) + try: + features[(features == np.inf) | (features == -np.inf)] = np.nan + except Exception as exc: + print("PROBLEM DTYPE", features.dtype, exc, features) + raise def replace_nans_with_empty_strings(input_data: InputData): """ Replace NaNs with empty strings in input_data.features """ - input_data.features = np.where(pd.isna(input_data.features), - '', - input_data.features) + input_data.features[pd.isna(input_data.features)] = '' -def convert_into_column(array: np.array): +def convert_into_column(array: np.ndarray) -> np.ndarray: """ Perform conversion for data if it is necessary """ if len(array.shape) == 1: return array.reshape(-1, 1) @@ -38,7 +38,7 @@ def convert_into_column(array: np.array): def divide_data_categorical_numerical(input_data: InputData, categorical_ids: list, - non_categorical_ids: list) -> (InputData, InputData): + non_categorical_ids: list) -> Tuple[Optional[InputData], Optional[InputData]]: """ Split tabular InputData into two parts: with numerical and categorical features using list with ids of categorical and numerical features. 
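
An aside on the equality-mask idiom that replace_inf_with_nans now relies on (and that PATCH 07 later keeps while dropping the debug prints): np.isinf rejects object-dtype arrays, while elementwise comparison against np.inf is defined for every cell type. A minimal standalone sketch, with an illustrative toy array rather than FEDOT's real InputData:

    import numpy as np
    import pandas as pd

    # Mixed-type feature table stored as object dtype, as tabular data is here
    features = np.array([[1.0, 'a'],
                         [np.inf, 'b'],
                         [-np.inf, None]], dtype=object)

    # np.isinf(features) raises TypeError on object dtype; '==' works per cell
    mask = (features == np.inf) | (features == -np.inf)
    features[mask] = np.nan  # in-place, unlike the np.where copy it replaces

    print(pd.isna(features).sum())  # two replaced infs plus one None -> 3
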
@@ -65,7 +65,7 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li raise ValueError(f'{prefix} Check data for Nans and inf values') -def find_categorical_columns(table: np.array, column_types: dict = None): +def find_categorical_columns(table: np.ndarray, column_types: dict = None): """ Method for finding categorical and non-categorical columns in tabular data @@ -89,29 +89,16 @@ def find_categorical_columns(table: np.array, column_types: dict = None): return categorical_ids, non_categorical_ids -def force_categorical_determination(table): +def force_categorical_determination(table: np.ndarray): """ Find string columns using 'computationally expensive' approach """ - source_shape = table.shape - columns_number = source_shape[1] if len(source_shape) > 1 else 1 - categorical_ids = [] non_categorical_ids = [] - # For every column in table make check for first element - for column_id in range(0, columns_number): - column = table[:, column_id] if columns_number > 1 else table - col_shape = column.shape - for i in column: - # Check if element is string object or not until the first appearance - if len(col_shape) == 2 and isinstance(i[0], str): - # Column looks like [[n], [n], [n]] - categorical_ids.append(column_id) - break - elif len(col_shape) == 1 and isinstance(i, str): - # Column [n, n, n] - categorical_ids.append(column_id) - break - - if column_id not in categorical_ids: + # For every column in table make check + for column_id, column in enumerate(table.T): + # Check if column is of string objects + if pd.api.types.infer_dtype(column, skipna=True) == 'string': + categorical_ids.append(column_id) + else: non_categorical_ids.append(column_id) return categorical_ids, non_categorical_ids @@ -119,9 +106,7 @@ def force_categorical_determination(table): def data_has_missing_values(data: InputData) -> bool: """ Check data for missing values.""" - if data_type_is_suitable_preprocessing(data): - return pd.DataFrame(data.features).isna().sum().sum() > 0 - return False + return data_type_is_suitable_for_preprocessing(data) and pd.DataFrame(data.features).isna().sum().sum() > 0 def data_has_categorical_features(data: InputData) -> bool: diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 13bfdf5fef..1d7ecfc420 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -2,6 +2,8 @@ import numpy as np import pandas as pd +from typing import Tuple + from sklearn.preprocessing import LabelEncoder from fedot.core.data.data import InputData @@ -25,39 +27,35 @@ def fit(self, input_data: InputData): has str objects. 
If there are such features - convert it into int """ features_types = input_data.supplementary_data.column_types['features'] - categorical_ids, non_categorical_ids = find_categorical_columns(table=input_data.features, - column_types=features_types) + categorical_ids, _ = find_categorical_columns(table=input_data.features, + column_types=features_types) if len(categorical_ids) == 0: # There is no need to process categorical features return self binary_ids_to_convert = [] number_of_columns = input_data.features.shape[-1] - for column_id, number in enumerate(range(number_of_columns)): - column = np.array(input_data.features[:, column_id]) - - # Numpy with strings cannot be processed for nans search - so use pandas - pd_column = pd.Series(column) - is_row_has_nan = pd.isna(pd_column) - nans_number = is_row_has_nan.sum() - if nans_number > 0 and column_id in categorical_ids: + for column_id in range(number_of_columns): + pd_column = pd.Series(input_data.features[:, column_id], copy=True) + has_nan = pd_column.isna() + if has_nan.sum() and column_id in categorical_ids: # This categorical column has nans - column, gap_ids = replace_nans_with_fedot_nans(column, is_row_has_nan) - column_uniques = np.unique(column) + replaced_column, _ = replace_nans_with_fedot_nans(pd_column, has_nan) + column_uniques = replaced_column.unique() if len(column_uniques) <= 3: # There is column with binary categories and gaps self.binary_features_with_nans.append(column_id) binary_ids_to_convert.append(column_id) - self._train_encoder(column, column_id) + self._train_encoder(replaced_column, column_id) else: - column_uniques = np.unique(column) + column_uniques = pd_column.unique() if len(column_uniques) <= 2 and column_id in categorical_ids: # Column contains binary string feature binary_ids_to_convert.append(column_id) # Train encoder for current column - self._train_encoder(column, column_id) + self._train_encoder(pd_column, column_id) self.binary_ids_to_convert = binary_ids_to_convert return self @@ -72,18 +70,18 @@ def transform(self, input_data: InputData) -> InputData: converted_features = [] number_of_columns = input_data.features.shape[-1] - for column_id, number in enumerate(range(number_of_columns)): + for column_id in range(number_of_columns): if column_id in self.binary_ids_to_convert: # If column contains nans - replace them with fedot nans special string - column = input_data.features[:, column_id] - is_row_has_nan = pd.isna(pd.Series(column)) - column, gap_ids = replace_nans_with_fedot_nans(column, is_row_has_nan) + pd_column = pd.Series(input_data.features[:, column_id]) + has_nan = pd_column.isna() + replaced_column, gap_ids = replace_nans_with_fedot_nans(pd_column, has_nan) # Convert into integers - converted_column = self._apply_encoder(column, column_id, gap_ids) + converted_column = self._apply_encoder(replaced_column, column_id, gap_ids) else: # Stay column the same - converted_column = np.array(input_data.features[:, column_id]) + converted_column = input_data.features[:, column_id] converted_features.append(converted_column.reshape((-1, 1))) @@ -110,7 +108,7 @@ def fit_transform(self, input_data: InputData) -> InputData: self.fit(input_data) return self.transform(input_data) - def _train_encoder(self, column: np.array, column_id: int): + def _train_encoder(self, column: pd.Series, column_id: int): """ Convert labels in the column from string into int via Label encoding. So, Label encoder is fitted to do such transformation. 
""" @@ -120,18 +118,11 @@ def _train_encoder(self, column: np.array, column_id: int): # Store fitted label encoder for transform method self.binary_encoders.update({column_id: encoder}) - def _apply_encoder(self, column: np.array, column_id: int, gap_ids: np.array) -> np.array: + def _apply_encoder(self, column: pd.Series, column_id: int, gap_ids: pd.Series) -> np.ndarray: """ Apply already fitted encoders """ encoder = self.binary_encoders[column_id] - encoder_classes = list(encoder.classes_) - - # If the column contains categories not previously encountered - for label in list(set(column)): - if label not in encoder_classes: - encoder_classes.append(label) - - # Extent encoder classes - encoder.classes_ = np.array(encoder_classes) + # Extend encoder classes if the column contains categories not previously encountered + encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) if len(gap_ids) > 0: @@ -142,11 +133,7 @@ def _apply_encoder(self, column: np.array, column_id: int, gap_ids: np.array) -> return converted -def replace_nans_with_fedot_nans(column: np.array, is_row_has_nan): - # There are nans in the columns - find indices of such objects - # True > 0 - gap_ids = np.ravel(np.argwhere(is_row_has_nan.values > 0)) - +def replace_nans_with_fedot_nans(column: pd.Series, has_nan: pd.Series) -> Tuple[pd.Series, pd.Series]: # Add new category - 'fedot_nan' after converting it will be replaced by nans - column[gap_ids] = FEDOT_STR_NAN - return column, gap_ids + column[has_nan] = FEDOT_STR_NAN + return column, has_nan From fcabc4df905337ba44b179701c38a5f199af8bc5 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Fri, 25 Nov 2022 17:12:27 +0300 Subject: [PATCH 07/72] fixed replacing inf with nan --- fedot/core/data/data_preprocessing.py | 10 +++------- fedot/preprocessing/categorical.py | 3 +-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 2bdba0c1ce..9c2ae5fa83 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -13,13 +13,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - if features.dtype == object: - print(features[:2]) - try: - features[(features == np.inf) | (features == -np.inf)] = np.nan - except Exception as exc: - print("PROBLEM DTYPE", features.dtype, exc, features) - raise + has_infs = (features == np.inf) | (features == -np.inf) + if np.any(has_infs): + features[has_infs] = np.nan def replace_nans_with_empty_strings(input_data: InputData): diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 1d7ecfc420..d75541f989 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -1,9 +1,8 @@ from copy import deepcopy +from typing import Tuple import numpy as np import pandas as pd -from typing import Tuple - from sklearn.preprocessing import LabelEncoder from fedot.core.data.data import InputData From cf9447a998932e1a765e40a1d3cd4d318a2a5b9d Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 28 Nov 2022 13:32:19 +0300 Subject: [PATCH 08/72] label encoder same refactoring --- .../data_operations/categorical_encoders.py | 30 +++++++------------ 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py 
b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 055655f0f8..2adfdf77a9 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -1,13 +1,16 @@ from copy import deepcopy -from typing import Optional, Union +from typing import Optional import numpy as np -from sklearn.preprocessing import OneHotEncoder, LabelEncoder +import pandas as pd + +from sklearn.preprocessing import LabelEncoder, OneHotEncoder from fedot.core.data.data import InputData, OutputData from fedot.core.data.data_preprocessing import find_categorical_columns -from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \ +from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ( DataOperationImplementation +) from fedot.core.operations.operation_parameters import OperationParameters @@ -85,7 +88,7 @@ def _update_column_types(self, output_data: OutputData): output_data.encoded_idx = self.encoded_ids output_data.supplementary_data.column_types['features'] = numerical_columns - def _apply_one_hot_encoding(self, features: np.array) -> np.array: + def _apply_one_hot_encoding(self, features: np.ndarray) -> np.ndarray: """ The method creates a table based on categorical and real features after One Hot Encoding transformation @@ -139,10 +142,7 @@ def transform(self, input_data: InputData) -> OutputData: # If categorical features are exists - transform them inplace in InputData for categorical_id in self.categorical_ids: categorical_column = input_data.features[:, categorical_id] - - # Converting into string - so nans becomes marked as 'nan' - categorical_column = categorical_column.astype(str) - gap_ids = np.ravel(np.argwhere(categorical_column == 'nan')) + gap_ids = pd.isna(categorical_column) transformed = self._apply_label_encoder(categorical_column, categorical_id, gap_ids) copied_data.features[:, categorical_id] = transformed @@ -172,8 +172,8 @@ def _fit_label_encoders(self, input_data: InputData): self.encoders.update({categorical_id: le}) - def _apply_label_encoder(self, categorical_column: np.array, categorical_id: int, - gap_ids: Union[np.array, None]) -> np.array: + def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int, + gap_ids: np.ndarray) -> np.ndarray: """ Apply fitted LabelEncoder for column transformation :param categorical_column: numpy array with categorical features @@ -181,15 +181,7 @@ def _apply_label_encoder(self, categorical_column: np.array, categorical_id: int :param gap_ids: indices of gap elements in array """ column_encoder = self.encoders[categorical_id] - encoder_classes = list(column_encoder.classes_) - - # If the column contains categories not previously encountered - for label in sorted(list(set(categorical_column))): - if label not in encoder_classes: - encoder_classes.append(label) - - # Extent encoder classes - column_encoder.classes_ = np.array(encoder_classes) + column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, categorical_column))) transformed_column = column_encoder.transform(categorical_column) if len(gap_ids) > 0: From 7431c986c73214f2f0676c05df7c91fdb9e4e013 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 28 Nov 2022 21:36:29 +0300 Subject: [PATCH 09/72] logical fix in label encoder --- .../data_operations/categorical_encoders.py | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 2adfdf77a9..641c22c69d 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -142,12 +142,12 @@ def transform(self, input_data: InputData) -> OutputData: # If categorical features are exists - transform them inplace in InputData for categorical_id in self.categorical_ids: categorical_column = input_data.features[:, categorical_id] - gap_ids = pd.isna(categorical_column) + gap_ids: np.ndarray = pd.isna(categorical_column) transformed = self._apply_label_encoder(categorical_column, categorical_id, gap_ids) copied_data.features[:, categorical_id] = transformed - output_data = self._convert_to_output(input_data, + output_data = self._convert_to_output(copied_data, copied_data.features) self._update_column_types(output_data) From a8b9c90fa2917646eb2688c2dc62c59c7a9c5e98 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 30 Nov 2022 15:05:05 +0300 Subject: [PATCH 10/72] nans with cats in unique func fix --- .../data_operations/categorical_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 641c22c69d..3ed53f2fbb 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -181,7 +181,7 @@ def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: i :param gap_ids: indices of gap elements in array """ column_encoder = self.encoders[categorical_id] - column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, categorical_column))) + column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column))) transformed_column = column_encoder.transform(categorical_column) if len(gap_ids) > 0: From 243df8e6ef89ffd0f4422e9150d14a043010f110 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 6 Dec 2022 11:27:07 +0300 Subject: [PATCH 11/72] types fixes --- fedot/core/data/data.py | 2 ++ fedot/preprocessing/data_types.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index fc071d5a99..0247638e96 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -2,12 +2,14 @@ import glob import os + from copy import copy, deepcopy from dataclasses import dataclass, field from typing import Any, Iterable, List, Optional, Tuple, Union import numpy as np import pandas as pd + from golem.core.log import default_log from golem.utilities.requirements_notificator import warn_requirement diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 4ccb62b4ec..6d48c28191 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -110,7 +110,7 @@ def convert_data_for_predict(self, data: InputData): self._retain_columns_info_without_types_conflicts(data) return data - def 
remove_incorrect_features(self, table: np.array, converted_columns: dict): + def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): """ Remove from the table columns with conflicts with types were not resolved @@ -130,7 +130,7 @@ def remove_incorrect_features(self, table: np.array, converted_columns: dict): table = np.delete(table, self.columns_to_del, 1) return table - def features_types_converting(self, features: np.array) -> np.array: + def features_types_converting(self, features: np.ndarray) -> np.array: """ Convert all elements in the data in every feature column into one type :param features: tabular features array @@ -157,7 +157,7 @@ def features_types_converting(self, features: np.array) -> np.array: return features - def target_types_converting(self, target: np.array, task: Task) -> np.array: + def target_types_converting(self, target: np.ndarray, task: Task) -> np.array: """ Convert all elements in every target column into one type :param target: tabular target array @@ -185,7 +185,7 @@ def target_types_converting(self, target: np.array, task: Task) -> np.array: return target - def prepare_column_types_info(self, predictors: np.array, target: np.array = None, + def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = None, task: Task = None) -> dict: """ Prepare information about columns in a form of dictionary Dictionary has two keys: 'target' and 'features' @@ -224,7 +224,7 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): remained_column_types.append(col) data.supplementary_data.column_types['features'] = remained_column_types - def _check_columns_vs_types_number(self, table: np.array, column_types: list): + def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): # Check if columns number correct n_rows, n_cols = table.shape if n_cols != len(column_types): @@ -244,7 +244,7 @@ def _remove_pseudo_str_values_from_str_column(data: pd.DataFrame, column_id: int converted_column.append(cur_column[i]) data.features[:, column_id] = pd.Series(converted_column).values - def _convert_feature_into_one_type(self, mixed_column: np.array, column_info: dict, mixed_column_id: int): + def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int): """ Determine new type for current feature column based on the string ratio. And then convert column into it. :param mixed_column: one-dimensional array with several data types @@ -280,7 +280,7 @@ def _convert_feature_into_one_type(self, mixed_column: np.array, column_info: di self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. 
Drop column.') return None, 'removed' - def _convert_target_into_one_type(self, mixed_column: np.array, column_info: dict, mixed_column_id: int, + def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int, task: Task) -> Tuple[np.ndarray, str]: """ Convert target columns into one type based on column proportions of object and task """ if task.task_type is TaskTypesEnum.classification: @@ -424,7 +424,7 @@ def to_type(item): if table is None: return {} - n_rows, n_columns = table.shape + _, n_columns = table.shape columns_info = {} for column_id in range(n_columns): current_column = table[:, column_id] @@ -438,7 +438,7 @@ def to_type(item): float_number = (column_types == NAME_CLASS_FLOAT).sum() # Store information about nans in the target - nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) # TODO: maybe just convert to list to preserve idx pairs? + nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE)) columns_info.update({column_id: {'types': unique_column_types, 'str_number': str_number, 'int_number': int_number, @@ -462,7 +462,7 @@ def find_mixed_types_columns(columns_info: dict): return columns_with_mixed_types -def apply_type_transformation(table: np.array, column_types: list, log: LoggerAdapter): +def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter): """ Apply transformation for columns in dataset into desired type. Perform transformation on predict stage when column types were already determined @@ -510,7 +510,7 @@ def _obtain_new_column_type(column_info): return int -def _convert_predict_column_into_desired_type(table: np.array, current_column: np.array, +def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray, column_id: int, current_type, log: LoggerAdapter): try: table[:, column_id] = current_column.astype(current_type) From b6d5e77311ef8dce139c2a7f71a976748182016b Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 12 Dec 2022 16:03:59 +0300 Subject: [PATCH 12/72] minor improvements --- fedot/core/data/data_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 9c2ae5fa83..c0b411a73b 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -102,7 +102,7 @@ def force_categorical_determination(table: np.ndarray): def data_has_missing_values(data: InputData) -> bool: """ Check data for missing values.""" - return data_type_is_suitable_for_preprocessing(data) and pd.DataFrame(data.features).isna().sum().sum() > 0 + return data_type_is_suitable_for_preprocessing(data) and pd.DataFrame(data.features).isna().to_numpy().sum() > 0 def data_has_categorical_features(data: InputData) -> bool: From 21f4ce495878469c18eb635ebfe1af95046bc20a Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 14 Dec 2022 15:17:52 +0300 Subject: [PATCH 13/72] minor conversation fix from PR --- fedot/core/data/data_preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c0b411a73b..6272463009 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,3 +1,5 @@ +from typing import Tuple, Optional + import numpy as np import pandas as pd @@ -132,6 +134,4 @@ def data_has_text_features(data: InputData) -> bool: Returns bool, whether data has text fields or 
not """ # TODO andreygetmanov: make compatible with current text checking - if data.data_type is DataTypesEnum.text: - return True - return False + return data.data_type is DataTypesEnum.text From 001d8b18a4099b502627e3953559e10344e50585 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 15 Dec 2022 13:59:23 +0300 Subject: [PATCH 14/72] fix format + rename semantically --- fedot/core/data/data_preprocessing.py | 2 -- .../data_operations/categorical_encoders.py | 12 ++++++------ .../implementation_interfaces.py | 2 +- fedot/preprocessing/categorical.py | 10 +++++----- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 6272463009..5d25b1463d 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -3,8 +3,6 @@ import numpy as np import pandas as pd -from typing import Tuple, Optional - from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts from fedot.core.repository.dataset_types import DataTypesEnum diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 3ed53f2fbb..fa09cb1b23 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -142,9 +142,9 @@ def transform(self, input_data: InputData) -> OutputData: # If categorical features are exists - transform them inplace in InputData for categorical_id in self.categorical_ids: categorical_column = input_data.features[:, categorical_id] - gap_ids: np.ndarray = pd.isna(categorical_column) + has_nan: np.ndarray = pd.isna(categorical_column) - transformed = self._apply_label_encoder(categorical_column, categorical_id, gap_ids) + transformed = self._apply_label_encoder(categorical_column, categorical_id, has_nan) copied_data.features[:, categorical_id] = transformed output_data = self._convert_to_output(copied_data, @@ -173,21 +173,21 @@ def _fit_label_encoders(self, input_data: InputData): self.encoders.update({categorical_id: le}) def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int, - gap_ids: np.ndarray) -> np.ndarray: + has_nan: np.ndarray) -> np.ndarray: """ Apply fitted LabelEncoder for column transformation :param categorical_column: numpy array with categorical features :param categorical_id: index of current categorical column - :param gap_ids: indices of gap elements in array + :param has_nan: bool array of gap elements in the ``categorical_column`` """ column_encoder = self.encoders[categorical_id] column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column))) transformed_column = column_encoder.transform(categorical_column) - if len(gap_ids) > 0: + if len(has_nan) > 0: # Store np.nan values transformed_column = transformed_column.astype(object) - transformed_column[gap_ids] = np.nan + transformed_column[has_nan] = np.nan return transformed_column diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index f092fe671b..6e4703a6a5 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py 
+++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -219,7 +219,7 @@ def _convert_to_output(input_data: InputData, predict: np.array, return converted -def _convert_to_output_function(input_data: InputData, transformed_features: np.array, +def _convert_to_output_function(input_data: InputData, transformed_features: np.ndarray, data_type: DataTypesEnum = DataTypesEnum.table): """ Function prepare prediction of operation as OutputData object diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index d75541f989..e31937c8f6 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -74,10 +74,10 @@ def transform(self, input_data: InputData) -> InputData: # If column contains nans - replace them with fedot nans special string pd_column = pd.Series(input_data.features[:, column_id]) has_nan = pd_column.isna() - replaced_column, gap_ids = replace_nans_with_fedot_nans(pd_column, has_nan) + replaced_column, has_nan = replace_nans_with_fedot_nans(pd_column, has_nan) # Convert into integers - converted_column = self._apply_encoder(replaced_column, column_id, gap_ids) + converted_column = self._apply_encoder(replaced_column, column_id, has_nan) else: # Stay column the same converted_column = input_data.features[:, column_id] @@ -117,17 +117,17 @@ def _train_encoder(self, column: pd.Series, column_id: int): # Store fitted label encoder for transform method self.binary_encoders.update({column_id: encoder}) - def _apply_encoder(self, column: pd.Series, column_id: int, gap_ids: pd.Series) -> np.ndarray: + def _apply_encoder(self, column: pd.Series, column_id: int, has_nan: pd.Series) -> np.ndarray: """ Apply already fitted encoders """ encoder = self.binary_encoders[column_id] # Extend encoder classes if the column contains categories not previously encountered encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) - if len(gap_ids) > 0: + if len(has_nan) > 0: # Column has nans in its structure - after conversion replace it converted = converted.astype(float) - converted[gap_ids] = np.nan + converted[has_nan] = np.nan return converted From 267f704c2f0249776e93bb06585719a5b4604bbd Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 29 Dec 2022 15:28:53 +0300 Subject: [PATCH 15/72] PR fixes * rid of str variables for types in preprocessor * improved define column_types function in data_types.py --- fedot/core/data/data_preprocessing.py | 9 +- .../data_operations/categorical_encoders.py | 11 +- .../sklearn_transformations.py | 8 +- .../data_operations/ts_transformations.py | 5 +- fedot/core/operations/model.py | 137 +++++++++--------- fedot/preprocessing/categorical.py | 4 +- fedot/preprocessing/data_types.py | 115 ++++++++------- fedot/preprocessing/preprocessing.py | 7 +- test/unit/data/test_supplementary_data.py | 12 +- .../test_data_operations_implementations.py | 27 ++-- .../test_preprocessing_through_api.py | 4 +- test/unit/preprocessing/test_preprocessors.py | 13 +- 12 files changed, 184 insertions(+), 168 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 5d25b1463d..9f4455d87b 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -5,6 +5,7 @@ from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts from fedot.core.repository.dataset_types import DataTypesEnum +from 
fedot.preprocessing.data_types import TYPE_TO_ID def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: @@ -76,11 +77,11 @@ def find_categorical_columns(table: np.ndarray, column_types: dict = None): categorical_ids = [] non_categorical_ids = [] - for column_id, type_name in enumerate(column_types): - if 'str' in str(type_name): - categorical_ids.append(column_id) + for col_id, col_type_id in enumerate(column_types): + if col_type_id == TYPE_TO_ID[str]: + categorical_ids.append(col_id) else: - non_categorical_ids.append(column_id) + non_categorical_ids.append(col_id) return categorical_ids, non_categorical_ids diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index fa09cb1b23..58ee267622 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -12,6 +12,7 @@ DataOperationImplementation ) from fedot.core.operations.operation_parameters import OperationParameters +from fedot.preprocessing.data_types import TYPE_TO_ID class OneHotEncodingImplementation(DataOperationImplementation): @@ -35,9 +36,9 @@ def fit(self, input_data: InputData): :return encoder: trained encoder (optional output) """ features = input_data.features - features_types = input_data.supplementary_data.column_types.get('features') + features_type_ids = input_data.supplementary_data.column_types.get('features') categorical_ids, non_categorical_ids = find_categorical_columns(features, - features_types) + features_type_ids) # Indices of columns with categorical and non-categorical features self.categorical_ids = categorical_ids @@ -79,11 +80,11 @@ def _update_column_types(self, output_data: OutputData): if self.categorical_ids: # There are categorical features in the table col_types = output_data.supplementary_data.column_types['features'] - numerical_columns = [t_name for t_name in col_types if 'str' not in t_name] + numerical_columns = [t_name for t_name in col_types if t_name != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) - numerical_columns.extend([str(int)] * encoded_columns_number) + numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number) output_data.encoded_idx = self.encoded_ids output_data.supplementary_data.column_types['features'] = numerical_columns @@ -159,7 +160,7 @@ def _update_column_types(self, output_data: OutputData): # Categorical features were in the dataset col_types = output_data.supplementary_data.column_types['features'] for categorical_id in self.categorical_ids: - col_types[categorical_id] = str(int) + col_types[categorical_id] = TYPE_TO_ID[int] output_data.supplementary_data.column_types['features'] = col_types diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index b75e70076c..3b7e4b49b3 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -13,6 +13,7 @@ from 
fedot.core.operations.evaluation.operation_implementations. \ implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation from fedot.core.operations.operation_parameters import OperationParameters +from fedot.preprocessing.data_types import TYPE_TO_ID class ComponentAnalysisImplementation(DataOperationImplementation): @@ -87,8 +88,8 @@ def update_column_types(output_data: OutputData) -> OutputData: """Update column types after applying PCA operations """ - n_rows, n_cols = output_data.predict.shape - output_data.supplementary_data.column_types['features'] = [str(float) * n_cols] + _, n_cols = output_data.predict.shape + output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float] * n_cols] return output_data @@ -127,6 +128,7 @@ class FastICAImplementation(ComponentAnalysisImplementation): Args: params: OperationParameters with the hyperparameters """ + def __init__(self, params: Optional[OperationParameters]): super().__init__(params) self.pca = FastICA(**self.params.to_dict()) @@ -195,7 +197,7 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): if cols_number_added > 0: # There are new columns in the table col_types = output_data.supplementary_data.column_types['features'] - col_types.extend([str(float)] * cols_number_added) + col_types.extend([TYPE_TO_ID[float]] * cols_number_added) output_data.supplementary_data.column_types['features'] = col_types diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 6910bf0a30..74dd395fdb 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -13,6 +13,7 @@ ) from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.preprocessing.data_types import TYPE_TO_ID class LaggedImplementation(DataOperationImplementation): @@ -127,12 +128,12 @@ def _update_column_types(self, output_data: OutputData): """ features_n_rows, features_n_cols = output_data.predict.shape - features_column_types = [str(float)] * features_n_cols + features_column_types = [TYPE_TO_ID[float]] * features_n_cols column_types = {'features': features_column_types} if output_data.target is not None and len(output_data.target.shape) > 1: target_n_rows, target_n_cols = output_data.target.shape - column_types.update({'target': [str(float)] * target_n_cols}) + column_types.update({'target': [TYPE_TO_ID[float]] * target_n_cols}) output_data.supplementary_data.column_types = column_types def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py index 0d435a3ce0..8ecd309fd8 100644 --- a/fedot/core/operations/model.py +++ b/fedot/core/operations/model.py @@ -1,68 +1,69 @@ -import numpy as np - -from fedot.core.data.data import OutputData -from fedot.core.operations.operation import Operation -from fedot.core.repository.dataset_types import DataTypesEnum -from fedot.core.repository.operation_types_repository import OperationTypesRepository -from fedot.core.repository.tasks import TaskTypesEnum - - -class Model(Operation): - """Class with ``fit``/``predict`` methods defining the evaluation strategy 
for the task - - Args: - operation_type: name of the model - """ - - def __init__(self, operation_type: str): - super().__init__(operation_type=operation_type) - self.operations_repo = OperationTypesRepository('model') - - @staticmethod - def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData: - """Assign types for tabular data obtained from model predictions.\n - By default, all types of model predictions for tabular data can be clearly defined - """ - if output_data.data_type is not DataTypesEnum.table: - # No column data types info for non-tabular data - return output_data - - is_regression_task = output_data.task.task_type is TaskTypesEnum.regression - is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting - - predict_shape = np.array(output_data.predict).shape - # Add information about features - if is_regression_task or is_ts_forecasting_task: - if len(predict_shape) < 2: - column_info = {'features': [str(float)] * predict_shape[0]} - else: - column_info = {'features': [str(float)] * predict_shape[1]} - else: - if len(predict_shape) < 2: - output_data.predict = output_data.predict.reshape((-1, 1)) - predict_shape = output_data.predict.shape - # Classification task or clustering - if output_mode == 'labels': - column_info = {'features': [str(int)] * predict_shape[1]} - else: - column_info = {'features': [str(float)] * predict_shape[1]} - - # Add information about target - target_shape = output_data.target.shape if output_data.target is not None else None - if target_shape is None: - # There is no target column in output data - output_data.supplementary_data.column_types = column_info - return output_data - - if is_regression_task or is_ts_forecasting_task: - if len(target_shape) > 1: - column_info.update({'target': [str(float)] * target_shape[1]}) - else: - # Array present "time series" - column_info.update({'target': [str(float)] * len(output_data.target)}) - else: - # Classification task or clustering - column_info.update({'target': [str(int)] * predict_shape[1]}) - - output_data.supplementary_data.column_types = column_info - return output_data +import numpy as np + +from fedot.core.data.data import OutputData +from fedot.core.operations.operation import Operation +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.operation_types_repository import OperationTypesRepository +from fedot.core.repository.tasks import TaskTypesEnum +from fedot.preprocessing.data_types import TYPE_TO_ID + + +class Model(Operation): + """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task + + Args: + operation_type: name of the model + """ + + def __init__(self, operation_type: str): + super().__init__(operation_type=operation_type) + self.operations_repo = OperationTypesRepository('model') + + @staticmethod + def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData: + """Assign types for tabular data obtained from model predictions.\n + By default, all types of model predictions for tabular data can be clearly defined + """ + if output_data.data_type is not DataTypesEnum.table: + # No column data types info for non-tabular data + return output_data + + is_regression_task = output_data.task.task_type is TaskTypesEnum.regression + is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting + + predict_shape = np.array(output_data.predict).shape + # Add information about features + if is_regression_task or 
is_ts_forecasting_task:
+        if len(predict_shape) < 2:
+            column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
+        else:
+            column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+    else:
+        if len(predict_shape) < 2:
+            output_data.predict = output_data.predict.reshape((-1, 1))
+            predict_shape = output_data.predict.shape
+        # Classification task or clustering
+        if output_mode == 'labels':
+            column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
+        else:
+            column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+
+    # Add information about target
+    target_shape = output_data.target.shape if output_data.target is not None else None
+    if target_shape is None:
+        # There is no target column in output data
+        output_data.supplementary_data.column_types = column_info
+        return output_data
+
+    if is_regression_task or is_ts_forecasting_task:
+        if len(target_shape) > 1:
+            column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
+        else:
+            # Array present "time series"
+            column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
+    else:
+        # Classification task or clustering
+        column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
+
+    output_data.supplementary_data.column_types = column_info
+    return output_data
diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py
index e31937c8f6..09368286c5 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -7,7 +7,7 @@
 from fedot.core.data.data import InputData
 from fedot.core.data.data_preprocessing import find_categorical_columns
-from fedot.preprocessing.data_types import NAME_CLASS_INT, FEDOT_STR_NAN
+from fedot.preprocessing.data_types import TYPE_TO_ID, FEDOT_STR_NAN


 class BinaryCategoricalPreprocessor:
@@ -91,7 +91,7 @@ def transform(self, input_data: InputData) -> InputData:
         # Update features types
         features_types = copied_data.supplementary_data.column_types['features']
         for converted_column_id in self.binary_ids_to_convert:
-            features_types[converted_column_id] = NAME_CLASS_INT
+            features_types[converted_column_id] = TYPE_TO_ID[int]
         return copied_data

     def fit_transform(self, input_data: InputData) -> InputData:
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index 6d48c28191..e233fa88a5 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -14,10 +14,11 @@
 if TYPE_CHECKING:
     from fedot.core.data.data import InputData

-NAME_CLASS_STR = "<class 'str'>"
-NAME_CLASS_INT = "<class 'int'>"
-NAME_CLASS_FLOAT = "<class 'float'>"
-NAME_CLASS_NONE = "<class 'NoneType'>"
+_convertable_types = (bool, float, int, str, type(None))
+_types_ids = range(len(_convertable_types))
+
+TYPE_TO_ID = dict(zip(_convertable_types, _types_ids))
+
 FEDOT_STR_NAN = 'fedot_nan'
 # If unique values in the feature column is less than 13 - convert column into string type else to numerical
 CATEGORICAL_MAX_UNIQUE_TH = 13
@@ -120,8 +121,7 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict):
         if not converted_columns:
             return table

-        self.columns_to_del = [column_id for column_id, new_type_name in converted_columns.items() if
-                               new_type_name == 'removed']
+        self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id == -1]
         if not self.columns_to_del:
             # There are no columns to delete
             return table
@@ -218,15 +218,15 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData):
         data.features = self.remove_incorrect_features(data.features,
                                                        self.string_columns_transformation_failed)

-        remained_column_types = []
-        for i, col in enumerate(data.supplementary_data.column_types['features']):
-            if i not in self.string_columns_transformation_failed:
-                remained_column_types.append(col)
-        data.supplementary_data.column_types['features'] = remained_column_types
+        data.supplementary_data.column_types['features'] = [
+            col_type
+            for col_id, col_type in enumerate(data.supplementary_data.column_types['features'])
+            if col_id not in self.string_columns_transformation_failed
+        ]

     def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list):
         # Check if columns number correct
-        n_rows, n_cols = table.shape
+        _, n_cols = table.shape
         if n_cols != len(column_types):
             # There is an incorrect types calculation
             self.log.warning('Columns number and types numbers do not match.')
@@ -251,9 +251,9 @@ def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info:
         :param column_info: dictionary with information about types in the column
         :param mixed_column_id: index of column in dataset
         """
-        if len(column_info['types']) == 2 and NAME_CLASS_NONE in column_info['types']:
+        if len(column_info['types']) == 2 and TYPE_TO_ID[type(None)] in column_info['types']:
             # Column contain only one data type and nans
-            filtered_types = [x for x in column_info['types'] if x != NAME_CLASS_NONE]
+            filtered_types = [x for x in column_info['types'] if x != TYPE_TO_ID[type(None)]]
             return mixed_column, filtered_types[0]

         string_objects_number = column_info['str_number']
@@ -272,13 +272,13 @@ def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info:
             mixed_column = mixed_column.astype(object)
             mixed_column[column_info['nan_ids']] = np.nan
             del column_info['nan_ids']
-            return mixed_column, str(suggested_type)
+            return mixed_column, TYPE_TO_ID[suggested_type]
         except ValueError:
             # Cannot convert string objects into int or float (for example 'a' into int)
             prefix = f'Feature column with index {mixed_column_id} contains ' \
                      f'following data types: {column_info["types"]}.'
             self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.')
-            return None, 'removed'
+            return None, -1

     def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int,
                                       task: Task) -> Tuple[np.ndarray, str]:
@@ -291,7 +291,7 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: d
         try:
             mixed_column = mixed_column.astype(suggested_type)
-            return mixed_column, str(suggested_type)
+            return mixed_column, TYPE_TO_ID[suggested_type]
         except ValueError:
             # Cannot convert string objects into int or float (for example 'a' into int)
             target_column = pd.Series(mixed_column)
@@ -302,7 +302,7 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: d
             log_message = f'{prefix} String cannot be converted into {suggested_type}. Ignore non converted values.'
             self.log.debug(log_message)
             self.target_converting_has_errors = True
-            return converted_column.values, str(suggested_type)
+            return converted_column.values, TYPE_TO_ID[suggested_type]

     def _into_categorical_features_transformation_for_fit(self, data: InputData):
         """
@@ -313,7 +313,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
         for column_id in range(n_cols):
             # For every int/float column perform check
             column_type = data.supplementary_data.column_types['features'][column_id]
-            if 'int' in column_type or 'float' in column_type:
+            if column_type in [TYPE_TO_ID[int], TYPE_TO_ID[float]]:
                 numerical_column = pd.Series(data.features[:, column_id])

                 # Calculate number of unique values except nans
@@ -331,7 +331,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):

                     # Update information about column types (in-place)
                     features_types = data.supplementary_data.column_types['features']
-                    features_types[column_id] = NAME_CLASS_STR
+                    features_types[column_id] = TYPE_TO_ID[str]

     def _into_categorical_features_transformation_for_predict(self, data: InputData):
         """ Apply conversion into categorical string column for every signed column """
@@ -349,7 +349,7 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData)

             # Update information about column types (in-place)
             features_types = data.supplementary_data.column_types['features']
-            features_types[column_id] = NAME_CLASS_STR
+            features_types[column_id] = TYPE_TO_ID[str]

     def _into_numeric_features_transformation_for_fit(self, data: InputData):
         """
@@ -359,7 +359,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
         for column_id in range(n_cols):
             # For every string column perform converting if necessary
             column_type = data.supplementary_data.column_types['features'][column_id]
-            if 'str' in column_type:
+            if column_type == TYPE_TO_ID[str]:
                 string_column = pd.Series(data.features[:, column_id])

                 # Number of nans in the column
@@ -382,7 +382,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
                     # Update information about column types (in-place)
                     self.categorical_into_float.append(column_id)
                     features_types = data.supplementary_data.column_types['features']
-                    features_types[column_id] = NAME_CLASS_FLOAT
+                    features_types[column_id] = TYPE_TO_ID[float]
                 elif failed_ratio >= self.acceptable_failed_rate_top \
                         and is_column_contain_numerical_objects:
                     # The column consists mostly of truly str values and has a few ints/floats in it
@@ -390,7 +390,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
                 elif self.acceptable_failed_rate_top > failed_ratio >= self.acceptable_failed_rate_bottom:
                     # Probably numerical column contains a lot of '?' or 'x' as nans equivalents
                     # Add columns to remove list
-                    self.string_columns_transformation_failed.update({column_id: 'removed'})
+                    self.string_columns_transformation_failed.update({column_id: -1})

     def _into_numeric_features_transformation_for_predict(self, data: InputData):
         """ Apply conversion into float string column for every signed column """
@@ -409,7 +409,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData):

             # Update information about column types (in-place)
             features_types = data.supplementary_data.column_types['features']
-            features_types[column_id] = NAME_CLASS_FLOAT
+            features_types[column_id] = TYPE_TO_ID[float]


 def define_column_types(table: np.ndarray):
@@ -417,29 +417,38 @@ def define_column_types(table: np.ndarray):
     types, which column contains. If column with mixed type contain str object additional field 'str_ids' with
     indices of string objects is prepared """
-    def to_type(item):
-        return str(type(item))
-    vto_type = np.vectorize(to_type)
-
     if table is None:
         return {}

     _, n_columns = table.shape
+
+    nans = pd.isna(table)
+    table_of_types = np.empty_like(table, dtype=np.int8)
+    table_of_types[~nans] = [
+        TYPE_TO_ID[type(x.item() if getattr(x, 'item', False) else x)]
+        for x in table[~nans]
+    ]
+    table_of_types[nans] = TYPE_TO_ID[type(None)]
+
     columns_info = {}
     for column_id in range(n_columns):
-        current_column = table[:, column_id]
+        col_types = table_of_types[:, column_id]

-        column_types = np.where(pd.isna(current_column), str(type(None)), vto_type(current_column))
-        unique_column_types = np.unique(column_types)
+        unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True)

-        if len(unique_column_types) > 1:
-            str_number = (column_types == NAME_CLASS_STR).sum()
-            int_number = (column_types == NAME_CLASS_INT).sum()
-            float_number = (column_types == NAME_CLASS_FLOAT).sum()
+        if len(unique_col_types) > 1:
+            numbers = [
+                unique_col_types_number[unique_col_types == TYPE_TO_ID[t]]
+                for t in [str, int, float]
+            ]
+            str_number, int_number, float_number = [
+                number.item() if len(number) else 0
+                for number in numbers
+            ]

             # Store information about nans in the target
-            nan_ids = np.ravel(np.argwhere(column_types == NAME_CLASS_NONE))
-            columns_info.update({column_id: {'types': unique_column_types,
+            nan_ids = np.ravel(np.argwhere(col_types == TYPE_TO_ID[type(None)]))
+            columns_info.update({column_id: {'types': unique_col_types,
                                              'str_number': str_number,
                                              'int_number': int_number,
                                              'float_number': float_number,
@@ -447,7 +456,7 @@ def to_type(item):
         else:
             # There is only one type, or several types such as int and float
-            columns_info.update({column_id: {'types': unique_column_types}})
+            columns_info.update({column_id: {'types': unique_col_types}})

     return columns_info
@@ -469,11 +478,11 @@ def apply_type_transformation(table: np.ndarray, column_types: list, log: Logger
     during fit
     """

-    def type_by_name(current_type_name: str):
-        """ Return type by its name """
-        if 'int' in current_type_name:
+    def type_by_id(current_type_id: int):
+        """ Return type by its ID """
+        if current_type_id == TYPE_TO_ID[int]:
             return int
-        elif 'str' in current_type_name:
+        elif current_type_id == TYPE_TO_ID[str]:
             return str
         else:
             return float
@@ -485,7 +494,7 @@ def type_by_name(current_type_name: str):
     n_rows, n_cols = table.shape
     for column_id in range(n_cols):
         current_column = table[:, column_id]
-        current_type = type_by_name(column_types[column_id])
+        current_type = type_by_id(column_types[column_id])
         _convert_predict_column_into_desired_type(table=table, current_column=current_column,
                                                   current_type=current_type, column_id=column_id, log=log)
@@ -500,7 +509,7 @@ def convert_num_column_into_string_array(numerical_column: pd.Series) -> np.array
     return numerical_column.to_numpy()


-def _obtain_new_column_type(column_info):
+def _obtain_new_column_type(column_info: dict):
    """ Suggest int or float type based on the presence of nan and float values """
     if column_info['float_number'] > 0 or column_info['nan_number'] > 0:
         # Even if one of types are float - all elements should be converted into float
@@ -534,24 +543,24 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict)
     """
     updated_column_types = []
     for column_id, column_info in columns_types_info.items():
-        column_types = column_info['types']
+        column_type_ids = column_info['types']

-        if len(column_types) == 1:
+        if len(column_type_ids) == 1:
             # Column initially contain only one type
-            updated_column_types.append(column_types[0])
-        elif len(column_types) == 2 and NAME_CLASS_NONE in column_types:
+            updated_column_types.append(column_type_ids[0])
+        elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids:
             # Column with one type and nans
-            filtered_types = [x for x in column_types if x != NAME_CLASS_NONE]
+            filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]]
             updated_column_types.append(filtered_types[0])
         else:
-            if any('str' in column_type_name for column_type_name in column_types):
+            if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_type_ids):
                 # Mixed-types column with string
                 new_column_type = converted_columns[column_id]
-                if new_column_type != 'removed':
+                if new_column_type != -1:
                     updated_column_types.append(new_column_type)
             else:
                 # Mixed-types with float and integer
-                updated_column_types.append(NAME_CLASS_FLOAT)
+                updated_column_types.append(TYPE_TO_ID[float])

     return updated_column_types
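
For reference, a minimal standalone sketch of what the vectorized define_column_types computes (toy data; only numpy and pandas assumed, with TYPE_TO_ID rebuilt locally exactly as the hunk above defines it):

import numpy as np
import pandas as pd

_convertable_types = (bool, float, int, str, type(None))
TYPE_TO_ID = dict(zip(_convertable_types, range(len(_convertable_types))))

# A mixed object column: two str values, one int, one nan
column = np.array(['a', 1, 'b', np.nan], dtype=object)

nans = pd.isna(column)
col_types = np.empty_like(column, dtype=np.int8)
col_types[~nans] = [TYPE_TO_ID[type(x)] for x in column[~nans]]
col_types[nans] = TYPE_TO_ID[type(None)]

unique_types, counts = np.unique(col_types, return_counts=True)
# unique_types -> [2, 3, 4] (int, str, NoneType), counts -> [1, 2, 1]

One np.unique call with return_counts=True replaces the per-element Python loop of the old implementation, which is where the speedup comes from.
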
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index d541f94f98..bcbe969a0f 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -29,7 +29,7 @@
 from fedot.preprocessing.base_preprocessing import BasePreprocessor
 from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor
 from fedot.preprocessing.data_type_check import exclude_ts, exclude_multi_ts, exclude_image
-from fedot.preprocessing.data_types import NAME_CLASS_INT, TableTypesCorrector
+from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector
 from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer

 # The allowed percent of empty samples in features.
@@ -357,6 +357,7 @@ def _clean_extra_spaces(data: InputData) -> InputData:
         Returns:
             cleaned ``data``
         """
+
         def strip_all_strs(item: Union[object, str]):
             try:
                 return item.strip()
@@ -472,8 +473,8 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarray
         encoded_target = data.target
         if encoder is not None:
             # Target encoders have already been fitted
-            data.supplementary_data.column_types['target'] = [NAME_CLASS_INT]
-            encoded_target = encoder.transform(data.target)
+            data.supplementary_data.column_types['target'] = [TYPE_TO_ID[int]]
+            encoded_target = encoder.transform(encoded_target)
         if len(encoded_target.shape) == 1:
             encoded_target = encoded_target.reshape((-1, 1))
         return encoded_target
diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py
index 1074aff26a..5d5581139e 100644
--- a/test/unit/data/test_supplementary_data.py
+++ b/test/unit/data/test_supplementary_data.py
@@ -9,8 +9,8 @@
 from fedot.core.pipelines.pipeline import Pipeline
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID
 from test.unit.tasks.test_regression import get_synthetic_regression_data
-from test.unit.data.test_data_merge import unequal_outputs_table


 @pytest.fixture()
@@ -19,15 +19,15 @@ def outputs_table_with_different_types():
     task = Task(TaskTypesEnum.regression)
     idx = [0, 1, 2]
     target = [1, 2, 10]
-    data_info_first = SupplementaryData(column_types={'features': ["<class 'str'>", "<class 'float'>"],
-                                                      'target': ["<class 'int'>"]})
+    data_info_first = SupplementaryData(column_types={'features': [TYPE_TO_ID[str], TYPE_TO_ID[float]],
+                                                      'target': [TYPE_TO_ID[int]]})
     output_first = OutputData(idx=idx, features=None,
                               predict=np.array([['a', 1.1], ['b', 2], ['c', 3]], dtype=object),
                               task=task, target=target, data_type=DataTypesEnum.table,
                               supplementary_data=data_info_first)
-    data_info_second = SupplementaryData(column_types={'features': ["<class 'float'>"],
-                                                       'target': ["<class 'int'>"]})
+    data_info_second = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]],
+                                                       'target': [TYPE_TO_ID[int]]})
     output_second = OutputData(idx=idx, features=None,
                                predict=np.array([[2.5], [2.1], [9.3]], dtype=float),
                                task=task, target=target, data_type=DataTypesEnum.table,
@@ -124,4 +124,4 @@ def test_define_types_after_merging(outputs_table_with_different_types):
     ancestor_target_type = outputs[0].supplementary_data.column_types['target'][0]
     assert target_types[0] == ancestor_target_type
     assert len(features_types) == 3
-    assert tuple(features_types) == ("<class 'str'>", "<class 'float'>", "<class 'float'>")
+    assert tuple(features_types) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float])
diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py
index 9cdb67fe00..8910f154c0 100644
--- a/test/unit/data_operations/test_data_operations_implementations.py
+++ b/test/unit/data_operations/test_data_operations_implementations.py
@@ -21,8 +21,7 @@
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.operation_types_repository import OperationTypesRepository
 from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams
-from fedot.preprocessing.data_types import NAME_CLASS_FLOAT, NAME_CLASS_INT, \
-    NAME_CLASS_STR
+from fedot.preprocessing.data_types import TYPE_TO_ID
 from test.unit.preprocessing.test_preprocessing_through_api import data_with_only_categorical_features


@@ -130,7 +129,7 @@ def get_multivariate_time_series(mutli_ts=False):


 def get_nan_inf_data():
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_FLOAT] * 4})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]] * 4})
     train_input = InputData(idx=[0, 1, 2, 3],
                             features=np.array([[1, 2, 3, 4],
                                                [2, np.nan, 4, 5],
@@ -145,8 +144,8 @@ def get_single_feature_data(task=None):
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_INT],
-                                                'target': [NAME_CLASS_INT]})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int]],
+                                                'target': [TYPE_TO_ID[int]]})
     train_input = InputData(idx=[0, 1, 2, 3, 4, 5],
                             features=np.array([[1], [2], [3], [7], [8], [9]]),
                             target=np.array([[0], [0], [0], [1], [1], [1]]),
@@ -169,10 +168,10 @@ def get_mixed_data(task=None, extended=False):
                              [np.nan, np.nan, '1', np.nan, '2', 'not blue', 'di'],
                              [8, '1', '1', 0, '1', 'not blue', 'da bu'],
                              [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object)
-        features_types = [NAME_CLASS_INT, NAME_CLASS_STR, NAME_CLASS_STR, NAME_CLASS_INT,
-                          NAME_CLASS_STR, NAME_CLASS_STR, NAME_CLASS_STR]
+        features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int],
+                          TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]
         supp_data = SupplementaryData(column_types={'features': features_types,
-                                                    'target': [NAME_CLASS_INT]})
+                                                    'target': [TYPE_TO_ID[int]]})
     else:
         features = np.array([[1, '0', 1],
                              [2, '1', 0],
@@ -180,9 +179,9 @@ def get_mixed_data(task=None, extended=False):
                              [7, '1', 1],
                              [8, '1', 1],
                              [9, '0', 0]], dtype=object)
-        features_types = [NAME_CLASS_INT, NAME_CLASS_STR, NAME_CLASS_INT]
+        features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]
         supp_data = SupplementaryData(column_types={'features': features_types,
-                                                    'target': [NAME_CLASS_INT]})
+                                                    'target': [TYPE_TO_ID[int]]})

     train_input = InputData(idx=[0, 1, 2, 3, 4, 5],
                             features=features,
@@ -201,7 +200,7 @@ def get_nan_binary_data(task=None):
     Binary int columns must be processed as "almost categorical".
     For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33
     """
-    features_types = [NAME_CLASS_INT, NAME_CLASS_STR, NAME_CLASS_INT]
+    features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]
     supp_data = SupplementaryData(column_types={'features': features_types})
     features = np.array([[1, '0', 0],
                          [np.nan, np.nan, np.nan],
@@ -232,8 +231,8 @@ def get_unbalanced_dataset(size=10, disbalance=0.4, target_dim=None):
         target = target.reshape(-1, 1)

     supp_data = SupplementaryData(column_types={
-        'features': [NAME_CLASS_INT, NAME_CLASS_STR],
-        'target': [NAME_CLASS_INT]
+        'features': [TYPE_TO_ID[int], TYPE_TO_ID[str]],
+        'target': [TYPE_TO_ID[int]]
     })

     input_data = InputData(idx=np.arange(features.shape[0]),
@@ -252,7 +251,7 @@ def data_with_binary_int_features_and_equal_categories():
     must be processed as "almost categorical".
     For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33
     """
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_INT, NAME_CLASS_INT]})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int], TYPE_TO_ID[int]]})
     task = Task(TaskTypesEnum.classification)
     features = np.array([[1, 10],
                          [np.nan, np.nan],
diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py
index c1bbed9592..3b0e60fc25 100644
--- a/test/unit/preprocessing/test_preprocessing_through_api.py
+++ b/test/unit/preprocessing/test_preprocessing_through_api.py
@@ -6,12 +6,12 @@
 from fedot.core.data.supplementary_data import SupplementaryData
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
-from fedot.preprocessing.data_types import NAME_CLASS_STR
+from fedot.preprocessing.data_types import TYPE_TO_ID


 def data_with_only_categorical_features():
     """ Generate tabular data with only categorical features. All of them are binary. """
-    supp_data = SupplementaryData(column_types={'features': [NAME_CLASS_STR] * 3})
+    supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[str]] * 3})
     task = Task(TaskTypesEnum.regression)
     features = np.array([["'a'", "0", "1"],
                          ["'b'", "1", "0"],
diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py
index 038b9f44af..81fa8bb74b 100644
--- a/test/unit/preprocessing/test_preprocessors.py
+++ b/test/unit/preprocessing/test_preprocessors.py
@@ -9,6 +9,7 @@
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import TaskTypesEnum, Task
 from fedot.core.utils import fedot_project_root
+from fedot.preprocessing.data_types import TYPE_TO_ID
 from fedot.preprocessing.data_types import TableTypesCorrector, apply_type_transformation
 from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME
 from test.unit.preprocessing.test_pipeline_preprocessing import data_with_mixed_types_in_each_column, \
@@ -133,10 +134,10 @@ def test_column_types_converting_correctly():
     assert len(features_types) == 4
     assert len(target_types) == 2
-    assert features_types[0] == "<class 'str'>"
-    assert features_types[1] == "<class 'str'>"
-    assert features_types[2] == "<class 'str'>"
-    assert target_types[0] == target_types[0] == "<class 'str'>"
+    assert features_types[0] == TYPE_TO_ID[str]
+    assert features_types[1] == TYPE_TO_ID[str]
+    assert features_types[2] == TYPE_TO_ID[str]
+    assert target_types[0] == target_types[1] == TYPE_TO_ID[str]


 def test_column_types_process_correctly():
@@ -158,7 +159,7 @@ def test_column_types_process_correctly():
     features_columns = predicted.supplementary_data.column_types['features']
     assert len(features_columns) == predicted.predict.shape[1]
     # All output values are float
-    assert all('float' in str(feature_type) for feature_type in features_columns)
+    assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_columns)


 def test_complicated_table_types_processed_correctly():
@@ -263,7 +264,7 @@ def test_str_numbers_with_dots_and_commas_in_predict():
     input_data = InputData(idx=np.arange(4), features=features, target=target, task=task,
                            data_type=DataTypesEnum.table)

-    transformed_predict = apply_type_transformation(table=input_data.features, column_types=['int'],
+    transformed_predict = apply_type_transformation(table=input_data.features, column_types=[TYPE_TO_ID[int]],
                                                     log=default_log('test_str_numbers_with_dots_and_commas_in_predict'))

    assert all(transformed_predict == np.array([[8], [4], [3], [6]]))
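
To make the contract of this patch concrete: string type names are replaced everywhere by small integer IDs, and -1 becomes the sentinel for a dropped column. A toy illustration (not part of the patch; the reverse map ID_TO_TYPE is a hypothetical helper added here only for the demo):

_convertable_types = (bool, float, int, str, type(None))
TYPE_TO_ID = dict(zip(_convertable_types, range(len(_convertable_types))))
ID_TO_TYPE = {type_id: t for t, type_id in TYPE_TO_ID.items()}  # hypothetical reverse map

converted_columns = {0: TYPE_TO_ID[float], 1: -1, 2: TYPE_TO_ID[str]}
columns_to_del = [col_id for col_id, type_id in converted_columns.items() if type_id == -1]
assert columns_to_del == [1]
assert ID_TO_TYPE[converted_columns[0]] is float

Integer comparisons like column_type == TYPE_TO_ID[str] also avoid the fragile substring checks ('str' in column_type) of the old string-based scheme.
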
From d2d0f9e74eac7bafc6572b49c961e140d404c3a0 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Mon, 23 Jan 2023 16:38:07 +0300
Subject: [PATCH 16/72] PR fixes

---
 fedot/core/operations/model.py    | 138 +++++++++++++++++++-------------
 fedot/preprocessing/data_types.py |   4 +-
 2 files changed, 71 insertions(+), 71 deletions(-)

diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py
index 8ecd309fd8..1499c05a47 100644
--- a/fedot/core/operations/model.py
+++ b/fedot/core/operations/model.py
@@ -1,69 +1,69 @@
-import numpy as np
-
-from fedot.core.data.data import OutputData
-from fedot.core.operations.operation import Operation
-from fedot.core.repository.dataset_types import DataTypesEnum
-from fedot.core.repository.operation_types_repository import OperationTypesRepository
-from fedot.core.repository.tasks import TaskTypesEnum
-from fedot.preprocessing.data_types import TYPE_TO_ID
-
-
-class Model(Operation):
-    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
-
-    Args:
-        operation_type: name of the model
-    """
-
-    def __init__(self, operation_type: str):
-        super().__init__(operation_type=operation_type)
-        self.operations_repo = OperationTypesRepository('model')
-
-    @staticmethod
-    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
-        """Assign types for tabular data obtained from model predictions.\n
-        By default, all types of model predictions for tabular data can be clearly defined
-        """
-        if output_data.data_type is not DataTypesEnum.table:
-            # No column data types info for non-tabular data
-            return output_data
-
-        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
-        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
-
-        predict_shape = np.array(output_data.predict).shape
-        # Add information about features
-        if is_regression_task or is_ts_forecasting_task:
-            if len(predict_shape) < 2:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
-            else:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
-        else:
-            if len(predict_shape) < 2:
-                output_data.predict = output_data.predict.reshape((-1, 1))
-                predict_shape = output_data.predict.shape
-            # Classification task or clustering
-            if output_mode == 'labels':
-                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
-            else:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
-
-        # Add information about target
-        target_shape = output_data.target.shape if output_data.target is not None else None
-        if target_shape is None:
-            # There is no target column in output data
-            output_data.supplementary_data.column_types = column_info
-            return output_data
-
-        if is_regression_task or is_ts_forecasting_task:
-            if len(target_shape) > 1:
-                column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
-            else:
-                # Array present "time series"
-                column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
-        else:
-            # Classification task or clustering
-            column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
-
-        output_data.supplementary_data.column_types = column_info
-        return output_data
+import numpy as np
+
+from fedot.core.data.data import OutputData
+from fedot.core.operations.operation import Operation
+from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.core.repository.operation_types_repository import OperationTypesRepository
+from fedot.core.repository.tasks import TaskTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID
+
+
+class Model(Operation):
+    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
+
+    Args:
+        operation_type: name of the model
+    """
+
+    def __init__(self, operation_type: str):
+        super().__init__(operation_type=operation_type)
+        self.operations_repo = OperationTypesRepository('model')
+
+    @staticmethod
+    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
+        """Assign types for tabular data obtained from model predictions.\n
+        By default, all types of model predictions for tabular data can be clearly defined
+        """
+        if output_data.data_type is not DataTypesEnum.table:
+            # No column data types info for non-tabular data
+            return output_data
+
+        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
+        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
+
+        predict_shape = np.array(output_data.predict).shape
+        # Add information about features
+        if is_regression_task or is_ts_forecasting_task:
+            if len(predict_shape) < 2:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+        else:
+            if len(predict_shape) < 2:
+                output_data.predict = output_data.predict.reshape((-1, 1))
+                predict_shape = output_data.predict.shape
+            # Classification task or clustering
+            if output_mode == 'labels':
+                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+
+        # Add information about target
+        target_shape = output_data.target.shape if output_data.target is not None else None
+        if target_shape is None:
+            # There is no target column in output data
+            output_data.supplementary_data.column_types = column_info
+            return output_data
+
+        if is_regression_task or is_ts_forecasting_task:
+            if len(target_shape) > 1:
+                column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
+            else:
+                # Array present "time series"
+                column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
+        else:
+            # Classification task or clustering
+            column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
+
+        output_data.supplementary_data.column_types = column_info
+        return output_data
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index e233fa88a5..cdf6ecf708 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -425,7 +425,7 @@ def define_column_types(table: np.ndarray):
     nans = pd.isna(table)
     table_of_types = np.empty_like(table, dtype=np.int8)
     table_of_types[~nans] = [
-        TYPE_TO_ID[type(x.item() if getattr(x, 'item', False) else x)]
+        TYPE_TO_ID[type(x.item() if isinstance(x, (np.ndarray, np.generic)) else x)]
        for x in table[~nans]
     ]
     table_of_types[nans] = TYPE_TO_ID[type(None)]
@@ -447,7 +447,7 @@ def define_column_types(table: np.ndarray):

         # Store information about nans in the target
-        nan_ids = np.ravel(np.argwhere(col_types == TYPE_TO_ID[type(None)]))
+        nan_ids = np.where(nans[:, column_id])[0]
         columns_info.update({column_id: {'types': unique_col_types,
                                          'str_number': str_number,
                                          'int_number': int_number,
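
The isinstance fix above matters because numpy scalars are not keys of TYPE_TO_ID until they are unwrapped to plain Python values. A small numpy-only demo of the behavior being relied on (illustrative, not from the patch):

import numpy as np

x = np.int64(5)
type(x)          # <class 'numpy.int64'> - not a TYPE_TO_ID key
type(x.item())   # <class 'int'> - .item() unwraps to a Python scalar

# The previous getattr(x, 'item', False) check was also truthy for numpy
# scalars, but isinstance(x, (np.ndarray, np.generic)) states the intent
# directly and avoids matching arbitrary objects that happen to define item().
assert isinstance(x, np.generic)
assert not isinstance(5, np.generic)
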
From 466a3eaf7b790abe68190100fa1747ab4ecc81d4 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Thu, 9 Feb 2023 15:01:22 +0300
Subject: [PATCH 17/72] style fixes

---
 fedot/core/data/data_preprocessing.py          |  3 +--
 .../data_operations/categorical_encoders.py    |  6 +++---
 fedot/core/operations/model.py                 |  6 ++----
 fedot/preprocessing/data_types.py              | 18 ++++++++----------
 test/unit/preprocessing/test_preprocessors.py  |  6 +++---
 5 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index 9f4455d87b..92960e5283 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -30,8 +30,7 @@ def convert_into_column(array: np.ndarray) -> np.ndarray:
     """ Perform conversion for data if it is necessary """
     if len(array.shape) == 1:
         return array.reshape(-1, 1)
-    else:
-        return array
+    return array


 def divide_data_categorical_numerical(input_data: InputData, categorical_ids: list,
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 58ee267622..82c3767dad 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -36,9 +36,9 @@ def fit(self, input_data: InputData):
         :return encoder: trained encoder (optional output)
         """
         features = input_data.features
-        features_type_ids = input_data.supplementary_data.column_types.get('features')
+        features_types_ids = input_data.supplementary_data.column_types.get('features')
         categorical_ids, non_categorical_ids = find_categorical_columns(features,
-                                                                        features_type_ids)
+                                                                        features_types_ids)

         # Indices of columns with categorical and non-categorical features
         self.categorical_ids = categorical_ids
@@ -84,7 +84,7 @@ def _update_column_types(self, output_data: OutputData):
             # Calculate new binary columns number after encoding
             encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns)
-            numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number)
+            numerical_columns += [TYPE_TO_ID[int]] * encoded_columns_number
             output_data.encoded_idx = self.encoded_ids
             output_data.supplementary_data.column_types['features'] = numerical_columns
diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py
index 1499c05a47..9e38f5e27c 100644
--- a/fedot/core/operations/model.py
+++ b/fedot/core/operations/model.py
@@ -43,10 +43,8 @@ def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> Ou
                 output_data.predict = output_data.predict.reshape((-1, 1))
                 predict_shape = output_data.predict.shape
             # Classification task or clustering
-            if output_mode == 'labels':
-                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
-            else:
-                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+            target_type = int if output_mode == 'labels' else float
+            column_info = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]}

         # Add information about target
         target_shape = output_data.target.shape if output_data.target is not None else None
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index cdf6ecf708..1082ea43e9 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from copy import copy
+from typing import TYPE_CHECKING, Tuple

 import numpy as np
 import pandas as pd
@@ -8,13 +9,10 @@
 from fedot.core.repository.tasks import Task, TaskTypesEnum

-NoneType = type(None)
-from typing import TYPE_CHECKING, Tuple
-
 if TYPE_CHECKING:
     from fedot.core.data.data import InputData

-_convertable_types = (bool, float, int, str, type(None))
+_convertable_types = (bool, float, int, str, type(None))  # preserve lexicographical order
 _types_ids = range(len(_convertable_types))

 TYPE_TO_ID = dict(zip(_convertable_types, _types_ids))
@@ -543,17 +541,17 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict)
     """
     updated_column_types = []
     for column_id, column_info in columns_types_info.items():
-        column_type_ids = column_info['types']
+        column_types_ids = column_info['types']

-        if len(column_type_ids) == 1:
+        if len(column_types_ids) == 1:
             # Column initially contain only one type
-            updated_column_types.append(column_type_ids[0])
-        elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids:
+            updated_column_types.append(column_types_ids[0])
+        elif len(column_types_ids) == 2 and TYPE_TO_ID[type(None)] in column_types_ids:
             # Column with one type and nans
-            filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]]
+            filtered_types = [x for x in column_types_ids if x != TYPE_TO_ID[type(None)]]
             updated_column_types.append(filtered_types[0])
         else:
-            if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_type_ids):
+            if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_types_ids):
                 # Mixed-types column with string
                 new_column_type = converted_columns[column_id]
                 if new_column_type != -1:
diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py
index 81fa8bb74b..562e821a3e 100644
--- a/test/unit/preprocessing/test_preprocessors.py
+++ b/test/unit/preprocessing/test_preprocessors.py
@@ -156,10 +156,10 @@ def test_column_types_process_correctly():
     pipeline.fit(train_data)
     predicted = pipeline.predict(test_data)

-    features_columns = predicted.supplementary_data.column_types['features']
-    assert len(features_columns) == predicted.predict.shape[1]
+    features_types_ids = predicted.supplementary_data.column_types['features']
+    assert len(features_types_ids) == predicted.predict.shape[1]
     # All output values are float
-    assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_columns)
+    assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_types_ids)
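
One detail behind the .extend -> += change above: augmented assignment on a list mutates it in place, so any alias (here, the list held inside supplementary_data) sees the update, while plain concatenation rebinds the name. A two-line illustration (plain Python, illustrative values):

col_types = [1, 1]
alias = col_types            # e.g. the list stored in supplementary_data
col_types += [1] * 2         # in-place, same object - alias sees the change
assert alias == [1, 1, 1, 1]

col_types = col_types + [1]  # rebinding instead would NOT update alias
assert alias == [1, 1, 1, 1]
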
From 39785144636057cb6dbd3dc29eefc5290c12b2e4 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Thu, 9 Feb 2023 15:03:29 +0300
Subject: [PATCH 18/72] array creation via multiplication fix

---
 .../data_operations/sklearn_transformations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
index 3b7e4b49b3..47f77ba9e0 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
@@ -89,7 +89,7 @@ def update_column_types(output_data: OutputData) -> OutputData:
         """
         _, n_cols = output_data.predict.shape
-        output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float] * n_cols]
+        output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float]] * n_cols
         return output_data

@@ -197,7 +197,7 @@ def _update_column_types(self, source_features_shape, output_data: OutputData):
         if cols_number_added > 0:
             # There are new columns in the table
             col_types = output_data.supplementary_data.column_types['features']
-            col_types.extend([TYPE_TO_ID[float]] * cols_number_added)
+            col_types += [TYPE_TO_ID[float]] * cols_number_added
             output_data.supplementary_data.column_types['features'] = col_types
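
The bracket placement in the first hunk is the entire bug: multiplying inside the brackets scales the ID itself, multiplying outside repeats it. A quick illustration with stand-in values:

n_cols = 3
type_id = 1           # stand-in for TYPE_TO_ID[float]

[type_id * n_cols]    # [3]      - a single element: the id times n_cols
[type_id] * n_cols    # [1, 1, 1] - n_cols copies of the id, as intended
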
From f5e1589b2974409e4a544aaf565bb0e47166bd97 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 15 Feb 2023 13:30:56 +0300
Subject: [PATCH 19/72] unified unimodal methods

---
 fedot/preprocessing/base_preprocessing.py |   4 +-
 fedot/preprocessing/preprocessing.py      | 105 +++++++---------------
 2 files changed, 31 insertions(+), 78 deletions(-)

diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py
index 106e263501..a3244559c7 100644
--- a/fedot/preprocessing/base_preprocessing.py
+++ b/fedot/preprocessing/base_preprocessing.py
@@ -33,10 +33,8 @@ def __init__(self):
         self.features_encoders: Dict[str, Union[OneHotEncodingImplementation, LabelEncodingImplementation]] = {}
         self.use_label_encoder: bool = False
         self.features_imputers: Dict[str, ImputationImplementation] = {}
-        self.ids_relevant_features: Dict[str, List[int]] = {}
+        self.ids_relevant_features: Dict[str, np.ndarray] = {}

-        # Cannot be processed due to incorrect types or large number of nans
-        self.ids_incorrect_features: Dict[str, List[int]] = {}
         # Categorical preprocessor for binary categorical features
         self.binary_categorical_processors: Dict[str, BinaryCategoricalPreprocessor] = {}
         self.types_correctors: Dict[str, TableTypesCorrector] = {}
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index bcbe969a0f..04bcf106a5 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -107,13 +107,12 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) ->
         self._init_supplementary_preprocessors(data)

         if isinstance(data, InputData):
-            data = self._prepare_obligatory_unimodal_for_fit(data, source_name=DEFAULT_SOURCE_NAME)
+            data = self._prepare_obligatory_unimodal(data, source_name=DEFAULT_SOURCE_NAME)

         elif isinstance(data, MultiModalData):
             self._init_main_target_source_name(data)
             for data_source_name, values in data.items():
-                data[data_source_name] = self._prepare_obligatory_unimodal_for_fit(values,
-                                                                                   source_name=data_source_name)
+                data[data_source_name] = self._prepare_obligatory_unimodal(values, source_name=data_source_name)

         BasePreprocessor.mark_as_preprocessed(data)
         return data
@@ -122,12 +121,12 @@ def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData])
         if isinstance(data, InputData):
-            data = self._prepare_obligatory_unimodal_for_predict(data, source_name=DEFAULT_SOURCE_NAME)
+            data = self._prepare_obligatory_unimodal(data, source_name=DEFAULT_SOURCE_NAME, is_fit_stage=False)

         elif isinstance(data, MultiModalData):
             for data_source_name, values in data.items():
-                data[data_source_name] = self._prepare_obligatory_unimodal_for_predict(values,
-                                                                                       source_name=data_source_name)
+                data[data_source_name] = self._prepare_obligatory_unimodal(values, source_name=data_source_name,
+                                                                           is_fit_stage=False)

         BasePreprocessor.mark_as_preprocessed(data)
         return data
@@ -170,13 +169,14 @@ def _take_only_correct_features(self, data: InputData, source_name: str):
             source_name: name of the data source node
         """
         current_relevant_ids = self.ids_relevant_features[source_name]
-        if current_relevant_ids:
+        if len(current_relevant_ids):
             data.features = data.features[:, current_relevant_ids]

     @exclude_ts
     @exclude_multi_ts
     @exclude_image
-    def _prepare_obligatory_unimodal_for_fit(self, data: InputData, source_name: str) -> InputData:
+    def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
+                                     *, is_fit_stage: bool = True) -> InputData:
         """
         Processes InputData for pipeline fit method
@@ -204,19 +204,22 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
         replace_inf_with_nans(data)

         # Find incorrect features which must be removed
-        self._find_features_full_of_nans(data, source_name)
+        if is_fit_stage:
+            self._find_features_lacking_nans(data, source_name)
         self._take_only_correct_features(data, source_name)
-        data = self._drop_rows_with_nan_in_target(data)
-
-        # Column types processing - launch after correct features selection
-        self.types_correctors[source_name].convert_data_for_fit(data)
-        if self.types_correctors[source_name].target_converting_has_errors:
+        if is_fit_stage:
             data = self._drop_rows_with_nan_in_target(data)
-        # Train Label Encoder for categorical target if necessary and apply it
-        if source_name not in self.target_encoders:
-            self._train_target_encoder(data, source_name)
-        data.target = self._apply_target_encoding(data, source_name)
+            # Column types processing - launch after correct features selection
+            self.types_correctors[source_name].convert_data_for_fit(data)
+            if self.types_correctors[source_name].target_converting_has_errors:
+                data = self._drop_rows_with_nan_in_target(data)
+            # Train Label Encoder for categorical target if necessary and apply it
+            if source_name not in self.target_encoders:
+                self._train_target_encoder(data, source_name)
+            data.target = self._apply_target_encoding(data, source_name)
+        else:
+            self.types_correctors[source_name].convert_data_for_predict(data)

         # TODO andreygetmanov target encoding must be obligatory for all data types
         if data_type_is_text(data):
@@ -225,49 +228,10 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
         elif data_type_is_table(data):
             data = self._clean_extra_spaces(data)
             # Process binary categorical features
-            data = self.binary_categorical_processors[source_name].fit_transform(data)
-
-        return data
-
-    @exclude_ts
-    @exclude_multi_ts
-    @exclude_image
-    def _prepare_obligatory_unimodal_for_predict(self, data: InputData, source_name: str) -> InputData:
-        """
-        Processes InputData for pipeline predict method
-
-        Args:
-            data: to be preprocessed
-            source_name: name of the data source node
-
-        Returns:
-            obligatory-prepared data
-        """
-        if data.supplementary_data.obligatorily_preprocessed:
-            # Preprocessing was already done
-            return data
-        return data
-
-        # Convert datetime data to numerical
-        data.features = np_datetime_to_numeric(data.features)
-        if data.target is not None:
-            data.target = np_datetime_to_numeric(data.target)
-
-        # Wrap indices in numpy array
-        data.idx = np.array(data.idx)
-
-        # Fix tables / time series sizes
-        data = self._correct_shapes(data)
-        replace_inf_with_nans(data)
-
-        # Perform preprocessing for types - launch after correct features selection
-        self._take_only_correct_features(data, source_name)
-        self.types_correctors[source_name].convert_data_for_predict(data)
-
-        if data_type_is_text(data):
-            replace_nans_with_empty_strings(data)
-        if data_type_is_table(data):
-            data = self._clean_extra_spaces(data)
-            data = self.binary_categorical_processors[source_name].transform(data)
+            if is_fit_stage:
+                data = self.binary_categorical_processors[source_name].fit_transform(data)
+            else:
+                data = self.binary_categorical_processors[source_name].transform(data)

         return data
@@ -294,27 +258,18 @@ def _prepare_optional(self, pipeline, data: InputData, source_name: str):
             if not has_tag:
                 data = action_if_no_tag(data, source_name)

-    def _find_features_full_of_nans(self, data: InputData, source_name: str):
+    def _find_features_lacking_nans(self, data: InputData, source_name: str):
         """
-        Finds features with more than ALLOWED_NAN_PERCENT of nan's
+        Finds features with less than ALLOWED_NAN_PERCENT of nan's

         Args:
             data: data to find columns with nan values
             source_name: name of the data source node
         """
-        # Initialize empty lists to fill it with indices
-        self.ids_relevant_features[source_name] = []
-        self.ids_incorrect_features[source_name] = []
-
         features = data.features
-        n_samples, n_columns = features.shape
-
-        for i in range(n_columns):
-            feature = features[:, i]
-            if np.sum(pd.isna(feature)) / n_samples < ALLOWED_NAN_PERCENT:
-                self.ids_relevant_features[source_name].append(i)
-            else:
-                self.ids_incorrect_features[source_name].append(i)
+        axes_except_cols = (0,) + tuple(range(2, features.ndim))
+        are_allowed = np.mean(pd.isna(features), axis=axes_except_cols) < ALLOWED_NAN_PERCENT
+        self.ids_relevant_features[source_name] = np.nonzero(are_allowed)[0]

     @staticmethod
     def _drop_rows_with_nan_in_target(data: InputData) -> InputData:
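
The rewritten nan filter above computes all column nan-ratios in one vectorized pass. A standalone sketch of the same pattern, with 0.9 used as a stand-in for the module's ALLOWED_NAN_PERCENT constant (its real value is defined elsewhere in preprocessing.py):

import numpy as np
import pandas as pd

ALLOWED_NAN_PERCENT = 0.9  # stand-in value for the demo
features = np.array([[1, np.nan, np.nan],
                     [2, np.nan, 5.0],
                     [3, np.nan, 6.0]], dtype=object)

# For a 2-D table this reduces over axis 0 only; extra axes are folded in
# so the result is always one ratio per column
axes_except_cols = (0,) + tuple(range(2, features.ndim))
nan_ratio = np.mean(pd.isna(features), axis=axes_except_cols)
# nan_ratio -> [0.0, 1.0, 0.333...]
relevant_ids = np.nonzero(nan_ratio < ALLOWED_NAN_PERCENT)[0]
# relevant_ids -> [0, 2]; the all-nan column 1 is dropped
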
And if it is necessary, found acceptable strategy for operation

From d6dd5a9c898ed70cbf5b5afb1658457781a1eb4a Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 15 Feb 2023 13:47:24 +0300
Subject: [PATCH 21/72] add safer version of enum/strategies imports

---
 fedot/core/repository/json_evaluation.py      | 79 ++++++++++++-------
 .../repository/operation_types_repository.py  | 46 ++++++-----
 test/integration/models/test_repository.py   |  8 +-
 3 files changed, 79 insertions(+), 54 deletions(-)

diff --git a/fedot/core/repository/json_evaluation.py b/fedot/core/repository/json_evaluation.py
index 82948c3849..3ab0a96e93 100644
--- a/fedot/core/repository/json_evaluation.py
+++ b/fedot/core/repository/json_evaluation.py
@@ -1,37 +1,58 @@
-from typing import Union
+from importlib import import_module
+from typing import Union, TYPE_CHECKING, List

 # imports are required for the eval
-from fedot.core.repository.dataset_types import *
-from fedot.core.repository.tasks import *
+from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.core.repository.tasks import TaskTypesEnum
+
+if TYPE_CHECKING:
+    from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy


 def read_field(source: dict, field_name: str, default: list):
-    """ Function for reading field in the dictionary
+    """
+    Function for reading field in the dictionary
+
+    Args:
+        source: dictionary with information
+        field_name: name of the looked up field in the ``source``
+        default: default list if ``field_name`` is not in the source dict keys
+
+    Returns:
+        list with field values
+    """
+    field_value = source.get(field_name, default)
+    if isinstance(field_value, str):
+        return import_enums_from_str(field_value)
+    return field_value
+
+
+def import_enums_from_str(field_value: str) -> Union[List[DataTypesEnum],
+                                                     List[TaskTypesEnum]]:
+    """
+    Imports enums by theirs string name representation and returns list of theirs values
+
+    Args:
+        field_value: str representing list of
+            either class:`DataTypesEnum` or class:`TaskTypesEnum` values
+
+    Returns:
+        list of either class:`DataTypesEnum` or class:`TaskTypesEnum` values
+    """
+    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val != '']
+    return [
+        getattr(globals()[data_type], value)
+        for (data_type, value) in enums]
+
+
+def import_strategy_from_str(field_value: List[str]) -> 'EvaluationStrategy':
+    """
+    Imports evaluation strategy module and returns its particular type

-    :param source: dictionary with information
-    :param field_name: name of the field for searching for in it
-    :param default: default list if field_name is not in the source dict keys
+    Args:
+        field_value: list of [namespace, type_name]

-    :return : list with field values
+    Returns:
+        specific evaluation strategy
     """
-    if field_name in source.keys():
-        field_value = source[field_name]
-        if isinstance(field_value, str):
-            return eval_field_str(field_value)
-        else:
-            return field_value
-    else:
-        return default
-
-
-def eval_field_str(field_value) -> Union[List[DataTypesEnum],
-                                         List[TaskTypesEnum]]:
-    # TODO add docstring
-    return eval(field_value)
-
-
-def eval_strategy_str(field_value):
-    # TODO add docstring
-    namespace = field_value[0]
-    exec(f'from {namespace} import {field_value[1]}')
-    return eval(field_value[1])
+    return getattr(import_module(field_value[0]), field_value[1])
diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py
index 2228c32563..d300f10f15 100644
---
 a/fedot/core/repository/operation_types_repository.py
+++ b/fedot/core/repository/operation_types_repository.py
@@ -2,7 +2,7 @@
 import os
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional, Union, TYPE_CHECKING

 import numpy as np
 from golem.core.log import default_log
@@ -10,9 +10,12 @@
 from fedot.core.constants import BEST_QUALITY_PRESET_NAME, AUTO_PRESET_NAME
 from fedot.core.repository.dataset_types import DataTypesEnum
-from fedot.core.repository.json_evaluation import eval_field_str, eval_strategy_str, read_field
+from fedot.core.repository.json_evaluation import import_enums_from_str, import_strategy_from_str, read_field
 from fedot.core.repository.tasks import Task, TaskTypesEnum

+if TYPE_CHECKING:
+    from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy
+
 AVAILABLE_REPO_NAMES = ['all', 'model', 'data_operation', 'automl']


@@ -22,14 +25,14 @@ class OperationMetaInfo:
     input_types: List[DataTypesEnum]
     output_types: List[DataTypesEnum]
     task_type: List[TaskTypesEnum]
-    supported_strategies: Any
+    supported_strategies: Union['EvaluationStrategy', Dict[str, 'EvaluationStrategy']]
     allowed_positions: List[str]
     tags: Optional[List[str]] = None
     presets: Optional[List[str]] = None

-    def current_strategy(self, task: TaskTypesEnum):
-        """Method allows getting available processing strategies depending on the
-        selected task
+    def current_strategy(self, task: TaskTypesEnum) -> Optional['EvaluationStrategy']:
+        """
+        Gets available processing strategies depending on the selected task

         Args:
             task: machine learning task (e.g. regression and classification)
@@ -176,13 +179,9 @@ def _initialise_repo(cls, repo_path: str) -> List[OperationMetaInfo]:
             properties = operations_json.get(current_operation_key)
             metadata = metadata_json[properties['meta']]

-            task_types = eval_field_str(metadata['tasks'])
-            input_type = eval_field_str(properties['input_type']) \
-                if ('input_type' in properties) \
-                else eval_field_str(metadata['input_type'])
-            output_type = eval_field_str(properties['output_type']) \
-                if ('output_type' in properties) \
-                else eval_field_str(metadata['output_type'])
+            task_types = import_enums_from_str(metadata['tasks'])
+            input_type = import_enums_from_str(properties.get('input_type', metadata.get('input_type')))
+            output_type = import_enums_from_str(properties.get('output_type', metadata.get('output_type')))

             # Get available strategies for obtained metadata
             supported_strategies = OperationTypesRepository.get_strategies_by_metadata(metadata)
@@ -219,24 +218,29 @@ def _initialise_repo(cls, repo_path: str) -> List[OperationMetaInfo]:
         return operations_list

     @staticmethod
-    def get_strategies_by_metadata(metadata: dict):
-        """Method allow obtain strategy instance by the metadata
+    def get_strategies_by_metadata(metadata: dict) -> Union['EvaluationStrategy', Dict[str, 'EvaluationStrategy']]:
+        """
+        Obtains strategy instance by the metadata

         Args:
             metadata: information about meta of the operation
-            supported_strategies: available strategies for current metadata
+
+        Returns:
+            available strategies for current metadata
         """
         strategies_json = metadata['strategies']
         if isinstance(strategies_json, list):
-            supported_strategies = eval_strategy_str(strategies_json)
-        else:
+            supported_strategies = import_strategy_from_str(strategies_json)
+        elif isinstance(strategies_json, dict):
             supported_strategies = {}
-            for strategy_dict_key in strategies_json.keys():
+            for strategy_dct_key, strategy_str_value in strategies_json.items():
                 # Convert string into class path for import
-                import_path = eval_field_str(strategy_dict_key)
-                strategy_class = eval_strategy_str(strategies_json[strategy_dict_key])
+                import_path = import_enums_from_str(strategy_dct_key)
+                strategy_class = import_strategy_from_str(strategy_str_value)
                 supported_strategies.update({import_path: strategy_class})
+        else:
+            raise TypeError('strategies are of unknown type')
         return supported_strategies

     def operation_info_by_id(self, operation_id: str) -> Optional[OperationMetaInfo]:
diff --git a/test/integration/models/test_repository.py b/test/integration/models/test_repository.py
index 418134e23f..39eee3afd1 100644
--- a/test/integration/models/test_repository.py
+++ b/test/integration/models/test_repository.py
@@ -2,8 +2,8 @@
 import os

 from fedot.core.operations.evaluation.classification import SkLearnClassificationStrategy
-from fedot.core.repository.json_evaluation import eval_field_str, \
-    eval_strategy_str, read_field
+from fedot.core.repository.json_evaluation import import_enums_from_str, \
+    import_strategy_from_str, read_field
 from fedot.core.repository.operation_types_repository import (OperationTypesRepository,
                                                               get_operation_type_from_id)
 from fedot.core.repository.pipeline_operation_repository import PipelineOperationRepository
@@ -48,7 +48,7 @@ def test_search_in_repository_by_tag_correct():

 def test_eval_field_str():
     model_metadata = _model_metadata_example(mocked_path())
-    task_types = eval_field_str(model_metadata['tasks'])
+    task_types = import_enums_from_str(model_metadata['tasks'])
     assert len(task_types) == 1
     assert task_types[0] == TaskTypesEnum.classification

@@ -59,7 +59,7 @@ def test_eval_strategy_str():

     strategies_json = model_metadata['strategies']

-    strategy = eval_strategy_str(strategies_json)
+    strategy = import_strategy_from_str(strategies_json)
     assert strategy is SkLearnClassificationStrategy
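
The core of this patch is swapping eval/exec for importlib plus getattr, which resolves the same dotted path without executing arbitrary strings. A standalone sketch of the lookup pattern, using a stdlib class as a stand-in since a real strategy path would require the full fedot package:

from importlib import import_module

# ['module.path', 'ClassName'] pair, as stored in the repository JSON
field_value = ['collections', 'OrderedDict']  # stand-in values
cls = getattr(import_module(field_value[0]), field_value[1])
assert cls.__name__ == 'OrderedDict'
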
From b6ecf9a055650a94f7205b94f9bfe3c67254b15d Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Fri, 17 Feb 2023 14:29:49 +0300
Subject: [PATCH 22/72] optimizations and style fixes

---
 fedot/core/data/data_preprocessing.py        |  6 +--
 .../data_operations/categorical_encoders.py  | 14 +++---
 fedot/core/repository/json_evaluation.py     |  4 +-
 fedot/preprocessing/categorical.py           | 50 ++++++-------------
 fedot/preprocessing/data_types.py            | 10 ++--
 5 files changed, 31 insertions(+), 53 deletions(-)

diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index 92960e5283..10108e39bd 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool:

 def replace_inf_with_nans(input_data: InputData):
     features = input_data.features
-    has_infs = (features == np.inf) | (features == -np.inf)
-    if np.any(has_infs):
-        features[has_infs] = np.nan
+    inf_idxs: Tuple[np.ndarray, ...] = ((features == np.inf) | (features == -np.inf)).nonzero()
+    if len(inf_idxs[0]):
+        features[inf_idxs] = np.nan


 def replace_nans_with_empty_strings(input_data: InputData):
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 82c3767dad..27607c00f2 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import Optional
+from typing import Optional, Tuple

 import numpy as np
 import pandas as pd
@@ -143,9 +143,9 @@ def transform(self, input_data: InputData) -> OutputData:
         # If categorical features are exists - transform them inplace in InputData
         for categorical_id in self.categorical_ids:
             categorical_column = input_data.features[:, categorical_id]
-            has_nan: np.ndarray = pd.isna(categorical_column)
+            nan_idxs: Tuple[np.ndarray, ...] = pd.isna(categorical_column).nonzero()

-            transformed = self._apply_label_encoder(categorical_column, categorical_id, has_nan)
+            transformed = self._apply_label_encoder(categorical_column, categorical_id, nan_idxs)
             copied_data.features[:, categorical_id] = transformed

         output_data = self._convert_to_output(copied_data,
@@ -174,21 +174,21 @@ def _fit_label_encoders(self, input_data: InputData):
             self.encoders.update({categorical_id: le})

     def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int,
-                             has_nan: np.ndarray) -> np.ndarray:
+                             nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray:
         """ Apply fitted LabelEncoder for column transformation

         :param categorical_column: numpy array with categorical features
         :param categorical_id: index of current categorical column
-        :param has_nan: bool array of gap elements in the ``categorical_column``
+        :param nan_idxs: indices of gap elements in the ``categorical_column``
         """
         column_encoder = self.encoders[categorical_id]
         column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column)))

         transformed_column = column_encoder.transform(categorical_column)
-        if len(has_nan) > 0:
+        if len(nan_idxs[0]):
             # Store np.nan values
             transformed_column = transformed_column.astype(object)
-            transformed_column[has_nan] = np.nan
+            transformed_column[nan_idxs] = np.nan

         return transformed_column
diff --git a/fedot/core/repository/json_evaluation.py b/fedot/core/repository/json_evaluation.py
index 3ab0a96e93..ba4483ce0e 100644
--- a/fedot/core/repository/json_evaluation.py
+++ b/fedot/core/repository/json_evaluation.py
@@ -1,7 +1,7 @@
 from importlib import import_module
 from typing import Union, TYPE_CHECKING, List

-# imports are required for the eval
+# imports are required beneath in the function
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import TaskTypesEnum

@@ -39,7 +39,7 @@ def import_enums_from_str(field_value: str) -> Union[List[DataTypesEnum],
     Returns:
         list of either class:`DataTypesEnum` or class:`TaskTypesEnum` values
     """
-    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val != '']
+    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val]
     return [
         getattr(globals()[data_type], value)
         for (data_type, value) in enums]
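
The recurring pattern in this patch, boolean mask -> .nonzero() index tuple, behaves as follows (toy example, numpy and pandas only):

import numpy as np
import pandas as pd

column = np.array(['a', np.nan, 'b', np.nan], dtype=object)
nan_idxs = pd.isna(column).nonzero()   # tuple of index arrays: (array([1, 3]),)

# len(nan_idxs[0]) gives the nan count; fancy-indexing with the tuple
# writes back only into the gap positions
out = np.arange(4, dtype=object)
out[nan_idxs] = np.nan
# out -> [0, nan, 2, nan]

Keeping the indices instead of the full boolean mask means the later write-backs touch only the gap rows rather than re-scanning the whole column.
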
b/fedot/preprocessing/categorical.py index 09368286c5..fe5fa1eeaf 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -33,26 +33,23 @@ def fit(self, input_data: InputData): return self binary_ids_to_convert = [] - number_of_columns = input_data.features.shape[-1] - for column_id in range(number_of_columns): - pd_column = pd.Series(input_data.features[:, column_id], copy=True) - has_nan = pd_column.isna() - if has_nan.sum() and column_id in categorical_ids: + for column_id, column in enumerate(input_data.features.T): + pd_column = pd.Series(column, copy=True) + is_nan = pd_column.isna() + column_uniques = pd_column.unique() + if is_nan.sum() and column_id in categorical_ids: # This categorical column has nans - replaced_column, _ = replace_nans_with_fedot_nans(pd_column, has_nan) - column_uniques = replaced_column.unique() + pd_column[is_nan] = FEDOT_STR_NAN if len(column_uniques) <= 3: # There is column with binary categories and gaps self.binary_features_with_nans.append(column_id) binary_ids_to_convert.append(column_id) - self._train_encoder(replaced_column, column_id) + self._train_encoder(pd_column, column_id) else: - column_uniques = pd_column.unique() if len(column_uniques) <= 2 and column_id in categorical_ids: # Column contains binary string feature binary_ids_to_convert.append(column_id) - # Train encoder for current column self._train_encoder(pd_column, column_id) @@ -67,26 +64,15 @@ def transform(self, input_data: InputData) -> InputData: # There are no binary categorical features return input_data - converted_features = [] - number_of_columns = input_data.features.shape[-1] - for column_id in range(number_of_columns): + copied_data = deepcopy(input_data) + for column_id, column in enumerate(copied_data.features.T): if column_id in self.binary_ids_to_convert: # If column contains nans - replace them with fedot nans special string - pd_column = pd.Series(input_data.features[:, column_id]) - has_nan = pd_column.isna() - replaced_column, has_nan = replace_nans_with_fedot_nans(pd_column, has_nan) + nan_idxs: Tuple[np.ndarray, ...] 
= pd.isna(column).nonzero() + column[nan_idxs] = FEDOT_STR_NAN # Convert into integers - converted_column = self._apply_encoder(replaced_column, column_id, has_nan) - else: - # Stay column the same - converted_column = input_data.features[:, column_id] - - converted_features.append(converted_column.reshape((-1, 1))) - - # Store transformed features - copied_data = deepcopy(input_data) - copied_data.features = np.hstack(converted_features) + column[:] = self._apply_encoder(column, column_id, nan_idxs) # Update features types features_types = copied_data.supplementary_data.column_types['features'] @@ -117,22 +103,16 @@ def _train_encoder(self, column: pd.Series, column_id: int): # Store fitted label encoder for transform method self.binary_encoders.update({column_id: encoder}) - def _apply_encoder(self, column: pd.Series, column_id: int, has_nan: pd.Series) -> np.ndarray: + def _apply_encoder(self, column: np.ndarray, column_id: int, nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray: """ Apply already fitted encoders """ encoder = self.binary_encoders[column_id] # Extend encoder classes if the column contains categories not previously encountered encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) - if len(has_nan) > 0: + if len(nan_idxs[0]): # Column has nans in its structure - after conversion replace it converted = converted.astype(float) - converted[has_nan] = np.nan + converted[nan_idxs] = np.nan return converted - - -def replace_nans_with_fedot_nans(column: pd.Series, has_nan: pd.Series) -> Tuple[pd.Series, pd.Series]: - # Add new category - 'fedot_nan' after converting it will be replaced by nans - column[has_nan] = FEDOT_STR_NAN - return column, has_nan diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 1082ea43e9..0298d456e2 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -217,8 +217,8 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) data.supplementary_data.column_types['features'] = [ - col_type - for col_id, col_type in enumerate(data.supplementary_data.column_types['features']) + col_type_id + for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) if col_id not in self.string_columns_transformation_failed ] @@ -429,9 +429,7 @@ def define_column_types(table: np.ndarray): table_of_types[nans] = TYPE_TO_ID[type(None)] columns_info = {} - for column_id in range(n_columns): - col_types = table_of_types[:, column_id] - + for column_id, col_types in enumerate(table_of_types.T): unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True) if len(unique_col_types) > 1: @@ -445,7 +443,7 @@ def define_column_types(table: np.ndarray): ] # Store information about nans in the target - nan_ids = np.where(nans[:, column_id])[0] + nan_ids = np.nonzero(nans[:, column_id])[0] columns_info.update({column_id: {'types': unique_col_types, 'str_number': str_number, 'int_number': int_number, From 08712b3d5c22e4a0d8614cda9648566d86a9d344 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 21 Feb 2023 20:01:27 +0300 Subject: [PATCH 23/72] set psutil req with the one from golem --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c23f18a05d..4e2e072802 100644 --- a/requirements.txt +++ b/requirements.txt @@ 
-32,7 +32,7 @@ joblib>=0.17.0 requests>=2.0 tqdm typing>=3.7.0 -psutil>=5.7.3 +psutil>=5.9.2 # Tests pytest>=6.2.0 From 571157cb2f90c5c29538af5a003abc38287f5d7d Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 28 Feb 2023 11:19:31 +0300 Subject: [PATCH 24/72] bug fix --- fedot/core/data/data_preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 10108e39bd..c8de456492 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - inf_idxs: Tuple[np.ndarray, ...] = ((features == np.inf) | (features == -np.inf)).nonzero() - if len(inf_idxs[0]): - features[inf_idxs] = np.nan + has_infs = ((features == np.inf) | (features == -np.inf)) + if np.any(has_infs): + features[has_infs] = np.nan def replace_nans_with_empty_strings(input_data: InputData): From b2e5f82235f95e5bfdb75a84f6df8a9cf4048883 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 28 Feb 2023 12:31:37 +0300 Subject: [PATCH 25/72] nan to num optimization --- .../models/discriminant_analysis.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py index 9a61c16eca..317e3d41a4 100644 --- a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py +++ b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py @@ -29,7 +29,7 @@ def predict(self, input_data): """ prediction = self.model.predict(input_data.features) - prediction = nan_to_num(prediction) + prediction = np.nan_to_num(prediction) return prediction @@ -40,7 +40,7 @@ def predict_proba(self, input_data): """ prediction = self.model.predict_proba(input_data.features) - prediction = nan_to_num(prediction) + prediction = np.nan_to_num(prediction) return prediction @@ -93,14 +93,3 @@ class QDAImplementation(DiscriminantAnalysisImplementation): def __init__(self, params: Optional[OperationParameters] = None): super().__init__(params) self.model = QuadraticDiscriminantAnalysis(**self.params.to_dict()) - - -def nan_to_num(prediction): - """ Function converts nan values to numerical - - :return prediction: prediction without nans - """ - if np.array([pd.isna(_) for _ in prediction]).any(): - prediction = np.nan_to_num(prediction) - - return prediction From 4d42edb01fbffa6caebab6f8deaca434c84db585 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 28 Feb 2023 17:35:28 +0300 Subject: [PATCH 26/72] optimized cat features transform --- fedot/preprocessing/data_types.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 0298d456e2..b613595619 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -307,25 +307,24 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - _, n_cols = data.features.shape - for column_id in range(n_cols): + for column_id, column in enumerate(data.features.T): # For every int/float column perform check column_type = data.supplementary_data.column_types['features'][column_id] if column_type in [TYPE_TO_ID[int], TYPE_TO_ID[float]]: - numerical_column = pd.Series(data.features[:, column_id]) + pd_column = pd.Series(column) # Calculate number of unique values except nans - unique_numbers = len(numerical_column.dropna().unique()) + unique_numbers = len(pd_column.dropna().unique()) if 2 < unique_numbers < self.categorical_max_uniques_th: # Column need to be transformed into categorical (string) one self.numerical_into_str.append(column_id) # Convert into string - converted_array = convert_num_column_into_string_array(numerical_column) + converted_array = convert_num_column_into_string_array(pd_column) - # Store converted column into features table - data.features[:, column_id] = converted_array + # Store converted column into feature column + column[:] = converted_array # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] @@ -337,13 +336,14 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # There is no transformation for current table return data - _, n_cols = data.features.shape - for column_id in range(n_cols): + for column_id, column in enumerate(data.features.T): if column_id in self.numerical_into_str: - numerical_column = pd.Series(data.features[:, column_id]) + pd_column = pd.Series(column) # Column must be converted into categorical - converted_array = convert_num_column_into_string_array(numerical_column) - data.features[:, column_id] = converted_array + converted_array = convert_num_column_into_string_array(pd_column) + + # Store converted column into feature column + column[:] = converted_array # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] From 8f96d2d47a291ef5c98a5c540b9da05a07bb3a75 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 30 Mar 2023 15:59:08 +0300 Subject: [PATCH 27/72] rid of for loops (v1) --- .../data_operations/categorical_encoders.py | 42 +++--- fedot/preprocessing/categorical.py | 68 ++++----- fedot/preprocessing/data_types.py | 133 ++++++++---------- 3 files changed, 112 insertions(+), 131 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 27607c00f2..b2c6d6f3c7 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -140,13 +140,8 @@ def transform(self, input_data: InputData) -> OutputData: """ copied_data = deepcopy(input_data) if self.categorical_ids: - # If categorical features are exists - transform them inplace in InputData - for categorical_id in self.categorical_ids: - categorical_column = input_data.features[:, categorical_id] - nan_idxs: Tuple[np.ndarray, ...] 
= pd.isna(categorical_column).nonzero()
-
-            transformed = self._apply_label_encoder(categorical_column, categorical_id, nan_idxs)
-            copied_data.features[:, categorical_id] = transformed
+            # If categorical features exist - transform them inplace in InputData
+            self._apply_label_encoder(copied_data.features)
 
         output_data = self._convert_to_output(copied_data,
                                               copied_data.features)
@@ -173,24 +168,25 @@ def _fit_label_encoders(self, input_data: InputData):
 
             self.encoders.update({categorical_id: le})
 
-    def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int,
-                             nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray:
-        """ Apply fitted LabelEncoder for column transformation
-
-        :param categorical_column: numpy array with categorical features
-        :param categorical_id: index of current categorical column
-        :param nan_idxs: indices of gap elements in the ``categorical_column``
+    def _apply_label_encoder(self, data: np.ndarray):
         """
-        column_encoder = self.encoders[categorical_id]
-        column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column)))
+        Applies the fitted LabelEncoder to all categorical features inplace
 
-        transformed_column = column_encoder.transform(categorical_column)
-        if len(nan_idxs[0]):
-            # Store np.nan values
-            transformed_column = transformed_column.astype(object)
-            transformed_column[nan_idxs] = np.nan
-
-        return transformed_column
+        Args:
+            data: numpy array with all features
+        """
+        categorical_columns = data[:, self.categorical_ids]
+        for column_id, column in zip(self.categorical_ids, categorical_columns.T):
+            column_encoder = self.encoders[column_id]
+            column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, column)))
+
+            transformed_column = column_encoder.transform(column)
+            nan_idxs: Tuple[np.ndarray, ...]
= pd.isna(column).nonzero() + if len(nan_idxs[0]): + # Store np.nan values + transformed_column = transformed_column.astype(object) + transformed_column[nan_idxs] = np.nan + data[:, column_id] = transformed_column def get_params(self) -> OperationParameters: """ Due to LabelEncoder has no parameters - return empty set """ diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index fe5fa1eeaf..7c01f4b674 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -33,25 +33,24 @@ def fit(self, input_data: InputData): return self binary_ids_to_convert = [] - for column_id, column in enumerate(input_data.features.T): - pd_column = pd.Series(column, copy=True) + for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): + pd_column = pd.Series(column, name=column_id, copy=True) is_nan = pd_column.isna() - column_uniques = pd_column.unique() - if is_nan.sum() and column_id in categorical_ids: + column_nuniques = pd_column.nunique(False) + if is_nan.sum(): # This categorical column has nans pd_column[is_nan] = FEDOT_STR_NAN - if len(column_uniques) <= 3: + if column_nuniques <= 3: # There is column with binary categories and gaps self.binary_features_with_nans.append(column_id) binary_ids_to_convert.append(column_id) - self._train_encoder(pd_column, column_id) - else: - if len(column_uniques) <= 2 and column_id in categorical_ids: - # Column contains binary string feature - binary_ids_to_convert.append(column_id) - # Train encoder for current column - self._train_encoder(pd_column, column_id) + self._train_encoder(pd_column) + elif column_nuniques <= 2: + # Column contains binary string feature + binary_ids_to_convert.append(column_id) + # Train encoder for current column + self._train_encoder(pd_column) self.binary_ids_to_convert = binary_ids_to_convert return self @@ -65,14 +64,7 @@ def transform(self, input_data: InputData) -> InputData: return input_data copied_data = deepcopy(input_data) - for column_id, column in enumerate(copied_data.features.T): - if column_id in self.binary_ids_to_convert: - # If column contains nans - replace them with fedot nans special string - nan_idxs: Tuple[np.ndarray, ...] = pd.isna(column).nonzero() - column[nan_idxs] = FEDOT_STR_NAN - - # Convert into integers - column[:] = self._apply_encoder(column, column_id, nan_idxs) + self._apply_encoder(copied_data.features) # Update features types features_types = copied_data.supplementary_data.column_types['features'] @@ -93,7 +85,7 @@ def fit_transform(self, input_data: InputData) -> InputData: self.fit(input_data) return self.transform(input_data) - def _train_encoder(self, column: pd.Series, column_id: int): + def _train_encoder(self, column: pd.Series): """ Convert labels in the column from string into int via Label encoding. So, Label encoder is fitted to do such transformation. 
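+
+        A minimal sketch of the expected mapping (hypothetical values): for a binary
+        column pd.Series(['no', 'yes', 'no']), the fitted LabelEncoder sorts classes
+        alphabetically, so transform() would yield [0, 1, 0].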
""" @@ -101,18 +93,26 @@ def _train_encoder(self, column: pd.Series, column_id: int): encoder.fit(column) # Store fitted label encoder for transform method - self.binary_encoders.update({column_id: encoder}) - - def _apply_encoder(self, column: np.ndarray, column_id: int, nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray: - """ Apply already fitted encoders """ - encoder = self.binary_encoders[column_id] - # Extend encoder classes if the column contains categories not previously encountered - encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) + self.binary_encoders.update({column.name: encoder}) - converted = encoder.transform(column) - if len(nan_idxs[0]): - # Column has nans in its structure - after conversion replace it - converted = converted.astype(float) - converted[nan_idxs] = np.nan + def _apply_encoder(self, data: np.ndarray): + """ + Applies already fitted encoders to all binary features inplace - return converted + Args: + data: numpy array with all features + """ + binary_columns = data[:, self.binary_ids_to_convert] + for column_id, column in zip(self.binary_ids_to_convert, binary_columns.T): + encoder = self.binary_encoders[column_id] + nan_idxs: Tuple[np.ndarray, ...] = pd.isna(column).nonzero() + column[nan_idxs] = FEDOT_STR_NAN + # Extend encoder classes if the column contains categories not previously encountered + encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) + + converted = encoder.transform(column) + if len(nan_idxs[0]): + # Column has nans in its structure - after conversion replace it + converted = converted.astype(float) + converted[nan_idxs] = np.nan + data[:, column_id] = converted diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index b613595619..44a86d4fab 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -142,7 +142,7 @@ def features_types_converting(self, features: np.ndarray) -> np.array: for mixed_column_id in features_with_mixed_types: column_info = self.features_columns_info[mixed_column_id] - if column_info.get('str_number') > 0 or column_info.get('float_number') > 0: + if column_info.get('str_number') or column_info.get('float_number'): # There are string elements in the array mixed_column = features[:, mixed_column_id] updated_column, new_type_name = self._convert_feature_into_one_type(mixed_column, column_info, @@ -170,7 +170,7 @@ def target_types_converting(self, target: np.ndarray, task: Task) -> np.array: for mixed_column_id in target_with_mixed_types: column_info = self.target_columns_info[mixed_column_id] - if column_info.get('str_number') > 0: + if column_info.get('str_number'): # There are string elements in the array mixed_column = target[:, mixed_column_id] updated_column, new_type_name = self._convert_target_into_one_type(mixed_column, column_info, @@ -307,28 +307,21 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - for column_id, column in enumerate(data.features.T): - # For every int/float column perform check - column_type = data.supplementary_data.column_types['features'][column_id] - if column_type in [TYPE_TO_ID[int], TYPE_TO_ID[float]]: - pd_column = pd.Series(column) - - # Calculate number of unique values except nans - unique_numbers = len(pd_column.dropna().unique()) - - if 2 < unique_numbers < self.categorical_max_uniques_th: - # Column need to be transformed into categorical (string) one - self.numerical_into_str.append(column_id) - - # Convert into string - converted_array = convert_num_column_into_string_array(pd_column) - - # Store converted column into feature column - column[:] = converted_array - - # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[column_id] = TYPE_TO_ID[str] + features_types = data.supplementary_data.column_types['features'] + is_numeric_type = np.isin(features_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) + numeric_type_ids = np.nonzero(is_numeric_type)[0] + num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) + nuniques = num_df.nunique(dropna=True) + # reduce dataframe to include only categorical features + num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)] + cat_col_ids = num_df.columns + # Convert into string + data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() + # Columns need to be transformed into categorical (string) ones + self.numerical_into_str.extend(cat_col_ids) + for column_id in cat_col_ids: + # Update information about column types (in-place) + features_types[column_id] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -336,18 +329,16 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # There is no transformation for current table return data - for column_id, column in enumerate(data.features.T): - if column_id in self.numerical_into_str: - pd_column = pd.Series(column) - # Column must be converted into categorical - converted_array = convert_num_column_into_string_array(pd_column) - - # Store converted column into feature column - column[:] = converted_array - - # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[column_id] = TYPE_TO_ID[str] + # Get numerical columns + num_df = pd.DataFrame(data.features[:, self.numerical_into_str], columns=self.numerical_into_str) + + # Convert and apply categorical transformation + data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() + + # Update information about column types (in-place) + features_types = data.supplementary_data.column_types['features'] + for column_id in self.numerical_into_str: + features_types[column_id] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ @@ -396,7 +387,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): # There is no transformation for current table return data - n_rows, n_cols = data.features.shape + _, n_cols = data.features.shape for column_id in range(n_cols): if column_id in self.categorical_into_float and column_id not 
in self.string_columns_transformation_failed: string_column = pd.Series(data.features[:, column_id]) @@ -418,8 +409,7 @@ def define_column_types(table: np.ndarray): if table is None: return {} - _, n_columns = table.shape - + #df_of_types = pd.DataFrame(table_of_types).transform() nans = pd.isna(table) table_of_types = np.empty_like(table, dtype=np.int8) table_of_types[~nans] = [ @@ -428,43 +418,39 @@ def define_column_types(table: np.ndarray): ] table_of_types[nans] = TYPE_TO_ID[type(None)] - columns_info = {} - for column_id, col_types in enumerate(table_of_types.T): - unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True) + table_of_types = pd.DataFrame(table_of_types) - if len(unique_col_types) > 1: - numbers = [ - unique_col_types_number[unique_col_types == TYPE_TO_ID[t]] - for t in [str, int, float] - ] - str_number, int_number, float_number = [ - number.item() if len(number) else 0 - for number in numbers - ] + # Build dataframe with unique types for each column + uniques = table_of_types.apply([pd.unique]).rename(index={'unique': 'types'}) - # Store information about nans in the target - nan_ids = np.nonzero(nans[:, column_id])[0] - columns_info.update({column_id: {'types': unique_col_types, - 'str_number': str_number, - 'int_number': int_number, - 'float_number': float_number, - 'nan_number': len(nan_ids), - 'nan_ids': nan_ids}}) - else: - # There is only one type, or several types such as int and float - columns_info.update({column_id: {'types': unique_col_types}}) - return columns_info + # Build dataframe with amount of each type + counts_index_mapper = { + TYPE_TO_ID[str]: 'str_number', + TYPE_TO_ID[int]: 'int_number', + TYPE_TO_ID[float]: 'float_number', + TYPE_TO_ID[type(None)]: 'nan_number' + } + types_counts = ( + table_of_types + .apply(pd.value_counts, dropna=False) + .reindex(counts_index_mapper.keys(), copy=False) + .replace(np.nan, 0) + .rename(index=counts_index_mapper, copy=False) + .astype(int) + ) + + # Build dataframe with nans indices + nans_ids = pd.DataFrame(nans).apply(np.where).rename(index={0: 'nan_ids'}) + return pd.concat([uniques, types_counts, nans_ids]).to_dict() def find_mixed_types_columns(columns_info: dict): """ Search for columns with several types in them """ - columns_with_mixed_types = [] - for column_id, information in columns_info.items(): - column_types = information['types'] - if len(column_types) > 1: - columns_with_mixed_types.append(column_id) - - return columns_with_mixed_types + return [ + col_id + for col_id, col_info in columns_info.items() + if len(col_info['types']) > 1 + ] def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter): @@ -510,9 +496,8 @@ def _obtain_new_column_type(column_info: dict): if column_info['float_number'] > 0 or column_info['nan_number'] > 0: # Even if one of types are float - all elements should be converted into float return float - else: - # It is available to convert numerical into integer type - return int + # It is available to convert numerical into integer type + return int def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray, @@ -549,7 +534,7 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) filtered_types = [x for x in column_types_ids if x != TYPE_TO_ID[type(None)]] updated_column_types.append(filtered_types[0]) else: - if any(column_type_id == TYPE_TO_ID[str] for column_type_id in column_types_ids): + if TYPE_TO_ID[str] in column_types_ids: # Mixed-types column with 
string new_column_type = converted_columns[column_id] if new_column_type != -1: From 7568b2bfb857b50db931a462fbd49e4f39783167 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 18 Apr 2023 17:39:12 +0300 Subject: [PATCH 28/72] compound names fix --- fedot/core/data/data_preprocessing.py | 23 ++++++++++--------- .../data_operations/categorical_encoders.py | 8 +++---- .../sklearn_transformations.py | 4 ++-- fedot/preprocessing/categorical.py | 6 ++--- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c8de456492..bb060047ce 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,4 +1,4 @@ -from typing import Tuple, Optional +from typing import Tuple, Optional, List import numpy as np import pandas as pd @@ -55,28 +55,29 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li # Only categorical categorical_input = input_data.subset_features(categorical_ids) return None, categorical_input - else: prefix = 'InputData contains no categorical and no numerical features.' raise ValueError(f'{prefix} Check data for Nans and inf values') -def find_categorical_columns(table: np.ndarray, column_types: dict = None): +def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[List[int]] = None): """ Method for finding categorical and non-categorical columns in tabular data - :param table: tabular data for string columns types determination - :param column_types: list with column types. If None, perform default checking - :return categorical_ids: indices of categorical columns in table - :return non_categorical_ids: indices of non categorical columns in table + Args: + table: tabular data for string columns types determination. + column_type_ids: list with column types. If None, perform default checking. + Returns: + categorical_ids: indices of categorical columns in table. + non_categorical_ids: indices of non categorical columns in table. 
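+
+    Example (hypothetical type ids): for a two-column table with
+    column_type_ids = [TYPE_TO_ID[str], TYPE_TO_ID[int]], the call
+    find_categorical_columns(table, column_type_ids) would return ([0], [1]) -
+    column 0 is treated as categorical, column 1 is not.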
""" - if column_types is None: + if column_type_ids is None: # Define if data contains string columns for "unknown table" return force_categorical_determination(table) categorical_ids = [] non_categorical_ids = [] - for col_id, col_type_id in enumerate(column_types): + for col_id, col_type_id in enumerate(column_type_ids): if col_type_id == TYPE_TO_ID[str]: categorical_ids.append(col_id) else: @@ -113,8 +114,8 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: return False - features_types = data.supplementary_data.column_types.get('features') - cat_ids, non_cat_ids = find_categorical_columns(data.features, features_types) + column_type_ids = data.supplementary_data.column_types.get('features') + cat_ids, non_cat_ids = find_categorical_columns(data.features, column_type_ids) data_has_categorical_columns = len(cat_ids) > 0 data.numerical_idx = non_cat_ids diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index b2c6d6f3c7..361ccc2eee 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -36,9 +36,9 @@ def fit(self, input_data: InputData): :return encoder: trained encoder (optional output) """ features = input_data.features - features_types_ids = input_data.supplementary_data.column_types.get('features') + column_type_ids = input_data.supplementary_data.column_types.get('features') categorical_ids, non_categorical_ids = find_categorical_columns(features, - features_types_ids) + column_type_ids) # Indices of columns with categorical and non-categorical features self.categorical_ids = categorical_ids @@ -124,9 +124,9 @@ def __init__(self, params: Optional[OperationParameters] = None): self.non_categorical_ids = None def fit(self, input_data: InputData): - features_types = input_data.supplementary_data.column_types.get('features') + column_type_ids = input_data.supplementary_data.column_types.get('features') self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - features_types) + column_type_ids) # If there are categorical features - process it if self.categorical_ids: diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 47f77ba9e0..530eb84320 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -290,9 +290,9 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) if data_type_is_table(input_data) and data_has_categorical_features(input_data): - features_types = input_data.supplementary_data.column_types.get('features') + column_type_ids = input_data.supplementary_data.column_types.get('features') self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - features_types) + column_type_ids) numerical, categorical = divide_data_categorical_numerical(input_data, self.categorical_ids, self.non_categorical_ids) diff --git a/fedot/preprocessing/categorical.py 
b/fedot/preprocessing/categorical.py
index 7c01f4b674..382b927106 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -25,9 +25,9 @@ def fit(self, input_data: InputData):
         Find indices of columns which contain categorical values. Binary features
         and at the same time have str objects. If there are such features - convert them into int
         """
-        features_types = input_data.supplementary_data.column_types['features']
-        categorical_ids, _ = find_categorical_columns(table=input_data.features,
-                                                      column_types=features_types)
+        column_type_ids = input_data.supplementary_data.column_types['features']
+        categorical_ids, _ = find_categorical_columns(input_data.features,
+                                                      column_type_ids)
         if len(categorical_ids) == 0:
             # There is no need to process categorical features
             return self

From d566947ba8076249e4297b694bdf8bea5a365e48 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Tue, 18 Apr 2023 17:42:02 +0300
Subject: [PATCH 29/72] simplified data_preprocessing.py

---
 fedot/core/data/data.py               |  9 ++++++---
 fedot/core/data/data_preprocessing.py | 18 +++---------------
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index 0247638e96..bf06dbf87b 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -530,11 +530,14 @@ def subset_indices(self, selected_idx: List):
                          target=self.target[row_nums], task=self.task,
                          data_type=self.data_type)
 
-    def subset_features(self, features_ids: list):
-        """Return new :obj:`InputData` with subset of features based on ``features_ids`` list
-        """
+    def subset_features(self, feature_ids: list) -> Optional[InputData]:
+        """
+        Return new :obj:`InputData` with subset of features based on non-empty ``feature_ids`` list or `None` otherwise
+        """
+        if not feature_ids:
+            return None
 
-        subsample_features = self.features[:, features_ids]
+        subsample_features = self.features[:, feature_ids]
         subsample_input = InputData(features=subsample_features,
                                     data_type=self.data_type,
                                     target=self.target, task=self.task,
diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index bb060047ce..7c35cda649 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -39,22 +39,10 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li
     Split tabular InputData into two parts: with numerical and categorical features
     using list with ids of categorical and numerical features.
     """
-
-    if len(categorical_ids) > 0 and len(non_categorical_ids) > 0:
-        # Both categorical and numerical features
-        numerical_input = input_data.subset_features(non_categorical_ids)
-        categorical_input = input_data.subset_features(categorical_ids)
+    numerical_input = input_data.subset_features(non_categorical_ids)
+    categorical_input = input_data.subset_features(categorical_ids)
+    if not (numerical_input or categorical_input):
         return numerical_input, categorical_input
-
-    elif len(categorical_ids) == 0 and len(non_categorical_ids) > 0:
-        # Only numerical
-        numerical_input = input_data.subset_features(non_categorical_ids)
-        return numerical_input, None
-
-    elif len(categorical_ids) > 0 and len(non_categorical_ids) == 0:
-        # Only categorical
-        categorical_input = input_data.subset_features(categorical_ids)
-        return None, categorical_input
     else:
         prefix = 'InputData contains no categorical and no numerical features.'
raise ValueError(f'{prefix} Check data for Nans and inf values')

From eb7e28a1a15cf6ee03e416296eea9641247cfbe3 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 19 Apr 2023 17:19:06 +0300
Subject: [PATCH 30/72] data_preprocessing logic fix

---
 fedot/core/data/data_preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index 7c35cda649..b241e2cf43 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -41,7 +41,7 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li
     """
     numerical_input = input_data.subset_features(non_categorical_ids)
     categorical_input = input_data.subset_features(categorical_ids)
-    if not (numerical_input or categorical_input):
+    if numerical_input or categorical_input:
         return numerical_input, categorical_input
     else:
         prefix = 'InputData contains no categorical and no numerical features.'

From 2247e0227d741e92f48a3b6a18c3e355c886d875 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Wed, 19 Apr 2023 17:57:11 +0300
Subject: [PATCH 31/72] numpy's nonzero to flatnonzero

---
 .../data_operations/categorical_encoders.py | 10 +++++-----
 fedot/preprocessing/categorical.py          |  4 ++--
 fedot/preprocessing/preprocessing.py        |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 361ccc2eee..6213c53a75 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import Optional, Tuple
+from typing import Optional, List
 
 import numpy as np
 import pandas as pd
@@ -120,8 +120,8 @@ def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)  # LabelEncoder has no parameters
         self.encoders = {}
-        self.categorical_ids = None
-        self.non_categorical_ids = None
+        self.categorical_ids: Optional[List[int]] = None
+        self.non_categorical_ids: Optional[List[int]] = None
 
     def fit(self, input_data: InputData):
         column_type_ids = input_data.supplementary_data.column_types.get('features')
@@ -181,8 +181,8 @@ def _apply_label_encoder(self, data: np.ndarray):
             column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, column)))
 
             transformed_column = column_encoder.transform(column)
-            nan_idxs: Tuple[np.ndarray, ...]
= pd.isna(column).nonzero() + nan_idxs = np.flatnonzero(pd.isna(column)) column[nan_idxs] = FEDOT_STR_NAN # Extend encoder classes if the column contains categories not previously encountered encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column))) converted = encoder.transform(column) - if len(nan_idxs[0]): + if len(nan_idxs): # Column has nans in its structure - after conversion replace it converted = converted.astype(float) converted[nan_idxs] = np.nan diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 04bcf106a5..5186a1b84c 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -269,7 +269,7 @@ def _find_features_lacking_nans(self, data: InputData, source_name: str): features = data.features axes_except_cols = (0,) + tuple(range(2, features.ndim)) are_allowed = np.mean(pd.isna(features), axis=axes_except_cols) < ALLOWED_NAN_PERCENT - self.ids_relevant_features[source_name] = np.nonzero(are_allowed)[0] + self.ids_relevant_features[source_name] = np.flatnonzero(are_allowed) @staticmethod def _drop_rows_with_nan_in_target(data: InputData) -> InputData: From 9443d2169c694f1230c8fa600ab414d270b133c8 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 19 Apr 2023 17:58:22 +0300 Subject: [PATCH 32/72] simplified data_types.py --- fedot/preprocessing/data_types.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 44a86d4fab..ec6754f179 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,7 +1,7 @@ from __future__ import annotations from copy import copy -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Tuple, Optional, Dict import numpy as np import pandas as pd @@ -128,17 +128,12 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): table = np.delete(table, self.columns_to_del, 1) return table - def features_types_converting(self, features: np.ndarray) -> np.array: + def features_types_converting(self, features: np.ndarray) -> np.ndarray: """ Convert all elements in the data in every feature column into one type :param features: tabular features array """ features_with_mixed_types = find_mixed_types_columns(self.features_columns_info) - - if not features_with_mixed_types: - return features - - # There are mixed-types columns in features table - convert them for mixed_column_id in features_with_mixed_types: column_info = self.features_columns_info[mixed_column_id] @@ -309,7 +304,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): """ features_types = data.supplementary_data.column_types['features'] is_numeric_type = np.isin(features_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) - numeric_type_ids = np.nonzero(is_numeric_type)[0] + numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) nuniques = num_df.nunique(dropna=True) # reduce dataframe to include only categorical features @@ -401,7 +396,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): features_types[column_id] = TYPE_TO_ID[float] -def define_column_types(table: np.ndarray): +def define_column_types(table: Optional[np.ndarray]) -> Dict: """ Prepare information about types per columns. For each column store unique types, which column contains. 
If column with mixed type contain str object additional field 'str_ids' with indices of string objects is prepared @@ -409,16 +404,8 @@ def define_column_types(table: np.ndarray): if table is None: return {} - #df_of_types = pd.DataFrame(table_of_types).transform() - nans = pd.isna(table) - table_of_types = np.empty_like(table, dtype=np.int8) - table_of_types[~nans] = [ - TYPE_TO_ID[type(x.item() if isinstance(x, (np.ndarray, np.generic)) else x)] - for x in table[~nans] - ] - table_of_types[nans] = TYPE_TO_ID[type(None)] - - table_of_types = pd.DataFrame(table_of_types) + table_of_types = pd.DataFrame(table, copy=True) + table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8) # Build dataframe with unique types for each column uniques = table_of_types.apply([pd.unique]).rename(index={'unique': 'types'}) @@ -440,7 +427,9 @@ def define_column_types(table: np.ndarray): ) # Build dataframe with nans indices - nans_ids = pd.DataFrame(nans).apply(np.where).rename(index={0: 'nan_ids'}) + nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: 'nan_ids'}) + + # Combine all dataframes return pd.concat([uniques, types_counts, nans_ids]).to_dict() From adc77b6d678d6f6a6fd217953150a693340a4e40 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 27 Apr 2023 14:07:55 +0300 Subject: [PATCH 33/72] further opts --- fedot/preprocessing/data_types.py | 289 ++++++++++++++---------------- 1 file changed, 132 insertions(+), 157 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index ec6754f179..0e4f699c06 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,7 +1,7 @@ from __future__ import annotations from copy import copy -from typing import TYPE_CHECKING, Tuple, Optional, Dict +from typing import TYPE_CHECKING, Tuple, Optional, List, Dict import numpy as np import pandas as pd @@ -17,6 +17,13 @@ TYPE_TO_ID = dict(zip(_convertable_types, _types_ids)) +_TYPES = 'types' +_FLOAT_NUMBER = 'float_number' +_INT_NUMBER = 'int_number' +_STR_NUMBER = 'str_number' +_NAN_NUMBER = 'nan_number' +_NAN_IDS = 'nan_ids' + FEDOT_STR_NAN = 'fedot_nan' # If unique values in the feature column is less than 13 - convert column into string type else to numerical CATEGORICAL_MAX_UNIQUE_TH = 13 @@ -38,8 +45,8 @@ def __init__(self): self.acceptable_failed_rate_bottom = ACCEPTABLE_CONVERSION_FAILED_RATE_BOTTOM self.acceptable_failed_rate_top = ACCEPTABLE_CONVERSION_FAILED_RATE_TOP - self.features_columns_info = {} - self.target_columns_info = {} + self.features_columns_info = pd.DataFrame() + self.target_columns_info = pd.DataFrame() # Dictionary with information about converted during fitting columns self.features_converted_columns = {} @@ -116,15 +123,7 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): :param table: tabular dataset based on which new dataset will be generated :param converted_columns: dictionary with actions with table """ - if not converted_columns: - return table - - self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id == -1] - if not self.columns_to_del: - # There are no columns to delete - return table - - # Remove all "bad" columns + self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id is None] table = np.delete(table, self.columns_to_del, 1) return table @@ -133,48 +132,37 @@ def features_types_converting(self, 
features: np.ndarray) -> np.ndarray: :param features: tabular features array """ - features_with_mixed_types = find_mixed_types_columns(self.features_columns_info) - for mixed_column_id in features_with_mixed_types: - column_info = self.features_columns_info[mixed_column_id] + mixed_types_columns = _find_mixed_types_columns(self.features_columns_info) + cols_with_strings_or_floats = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER, _FLOAT_NUMBER]) - if column_info.get('str_number') or column_info.get('float_number'): - # There are string elements in the array - mixed_column = features[:, mixed_column_id] - updated_column, new_type_name = self._convert_feature_into_one_type(mixed_column, column_info, - mixed_column_id) - # Store information about converted columns - self.features_converted_columns.update({mixed_column_id: new_type_name}) + def _update_converted_columns_and_data(column_info: pd.Series): + updated_column, new_type_id = self._convert_feature_into_one_type(features[:, column_info.name], + column_info) + self.features_converted_columns[column_info.name] = new_type_id + if updated_column is not None: + features[:, column_info.name] = updated_column - if updated_column is not None: - features[:, mixed_column_id] = updated_column + cols_with_strings_or_floats.apply(_update_converted_columns_and_data) return features - def target_types_converting(self, target: np.ndarray, task: Task) -> np.array: + def target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray: """ Convert all elements in every target column into one type :param target: tabular target array :param task: task to solve """ - target_with_mixed_types = find_mixed_types_columns(self.target_columns_info) - - if not target_with_mixed_types: - return target - - # There are mixed-types columns in features table - convert them - for mixed_column_id in target_with_mixed_types: - column_info = self.target_columns_info[mixed_column_id] + mixed_types_columns = _find_mixed_types_columns(self.target_columns_info) + cols_with_strings = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER]) - if column_info.get('str_number'): - # There are string elements in the array - mixed_column = target[:, mixed_column_id] - updated_column, new_type_name = self._convert_target_into_one_type(mixed_column, column_info, - mixed_column_id, task) - # Store information about converted columns - self.target_converted_columns.update({mixed_column_id: new_type_name}) + def _update_converted_columns_and_data(column_info: pd.Series): + updated_column, new_type_id = self._convert_target_into_one_type(target[:, column_info.name], column_info, + task) + self.target_converted_columns[column_info.name] = new_type_id + if updated_column is not None: + target[:, column_info.name] = updated_column - if updated_column is not None: - target[:, mixed_column_id] = updated_column + cols_with_strings.apply(_update_converted_columns_and_data) return target @@ -183,11 +171,11 @@ def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = """ Prepare information about columns in a form of dictionary Dictionary has two keys: 'target' and 'features' """ - if not self.features_columns_info: + if self.features_columns_info.empty: # Information about column types is empty - there is a need to launch algorithm to collect info self.features_columns_info = define_column_types(predictors) predictors = self.features_types_converting(features=predictors) - if not self.target_columns_info and task.task_type is not 
TaskTypesEnum.ts_forecasting: + if self.target_columns_info.empty and task.task_type is not TaskTypesEnum.ts_forecasting: self.target_columns_info = define_column_types(target) target = self.target_types_converting(target=target, task=task) @@ -205,7 +193,7 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): """ Update information in supplementary info - retain info only about remained columns. Such columns have no conflicts with types converting. """ - if len(self.string_columns_transformation_failed) > 0: + if self.string_columns_transformation_failed: self.log.warning(f'Columns with indices {list(self.string_columns_transformation_failed.keys())} were ' f'removed during mixed types column converting due to conflicts.') @@ -225,32 +213,32 @@ def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): self.log.warning('Columns number and types numbers do not match.') @staticmethod - def _remove_pseudo_str_values_from_str_column(data: pd.DataFrame, column_id: int): + def _remove_pseudo_str_values_from_str_column(data: InputData, columns: pd.Index): """ Removes from truly str column all pseudo str values """ - cur_column = data.features[:, column_id] - converted_column = [] - for i in range(len(cur_column)): - try: - float(cur_column[i]) - converted_column.append(np.nan) - except ValueError: - converted_column.append(cur_column[i]) - data.features[:, column_id] = pd.Series(converted_column).values - - def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int): + for col_id in columns: + for row_id, item in enumerate(data.features[:, col_id]): + try: + float(item) + except ValueError: + continue + else: + # item is numeric, remove its value + data.features[row_id, col_id] = np.nan + + def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series): """ Determine new type for current feature column based on the string ratio. And then convert column into it. 
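+
+        A worked example of the rule below (hypothetical counts): with 10 str, 85 int
+        and 5 float values, string_ratio = 10 / 100 > 0, so the whole column is cast
+        to str; when no str values are present, the int/float choice is delegated
+        to _obtain_new_column_type.
+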
:param mixed_column: one-dimensional array with several data types
-        :param column_info: dictionary with information about types in the column
-        :param mixed_column_id: index of column in dataset
+        :param column_info: pd.Series with information about types in the column
         """
-        if len(column_info['types']) == 2 and TYPE_TO_ID[type(None)] in column_info['types']:
+        if len(column_info[_TYPES]) == 2 and TYPE_TO_ID[type(None)] in column_info[_TYPES]:
             # Column contains only one data type and nans
-            filtered_types = [x for x in column_info['types'] if x != TYPE_TO_ID[type(None)]]
+            filtered_types = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
             return mixed_column, filtered_types[0]
 
-        string_objects_number = column_info['str_number']
-        all_elements_number = string_objects_number + column_info['int_number'] + column_info['float_number']
+        string_objects_number = column_info[_STR_NUMBER]
+        all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
         string_ratio = string_objects_number / all_elements_number
 
         if string_ratio > 0:
             suggested_type = str
         else:
             suggested_type = _obtain_new_column_type(column_info)
 
         try:
             mixed_column = mixed_column.astype(suggested_type)
             # If there were nans in the column - paste nan
-            if column_info['nan_number'] > 0:
+            if column_info[_NAN_NUMBER]:
                 mixed_column = mixed_column.astype(object)
-                mixed_column[column_info['nan_ids']] = np.nan
-                del column_info['nan_ids']
+                mixed_column[column_info[_NAN_IDS]] = np.nan
+                del column_info[_NAN_IDS]
             return mixed_column, TYPE_TO_ID[suggested_type]
         except ValueError:
             # Cannot convert string objects into int or float (for example 'a' into int)
-            prefix = f'Feature column with index {mixed_column_id} contains ' \
-                     f'following data types: {column_info["types"]}.'
+            prefix = f'Feature column with index {column_info.name} contains ' \
+                     f'following data types: {column_info[_TYPES]}.'
             self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.')
-            return None, -1
+            return None, None
 
-    def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: dict, mixed_column_id: int,
+    def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series,
                                       task: Task) -> Tuple[np.ndarray, str]:
         """ Convert target columns into one type based on column proportions of object and task """
         if task.task_type is TaskTypesEnum.classification:
@@ -290,8 +278,8 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: d
         target_column = pd.Series(mixed_column)
         converted_column = pd.to_numeric(target_column, errors='coerce')
 
-        prefix = f'Target column with index {mixed_column_id} contains ' \
-                 f'following data types: {column_info["types"]}.'
+        prefix = (f'Target column with index {column_info.name} contains '
+                  f'following data types: {column_info[_TYPES]}.')
         log_message = f'{prefix} String cannot be converted into {suggested_type}. Ignore non-converted values.'
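+        # pd.to_numeric(errors='coerce') above leaves unparsable values as NaN,
+        # so failed conversions are only logged below and ignored rather than raising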
self.log.debug(log_message) self.target_converting_has_errors = True @@ -313,7 +301,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): # Convert into string data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Columns need to be transformed into categorical (string) ones - self.numerical_into_str.extend(cat_col_ids) + self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) for column_id in cat_col_ids: # Update information about column types (in-place) features_types[column_id] = TYPE_TO_ID[str] @@ -326,10 +314,10 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # Get numerical columns num_df = pd.DataFrame(data.features[:, self.numerical_into_str], columns=self.numerical_into_str) - + # Convert and apply categorical transformation data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() - + # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] for column_id in self.numerical_into_str: @@ -339,107 +327,94 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): """ Automatically determine categorical features which should be converted into float """ - n_rows, n_cols = data.features.shape - for column_id in range(n_cols): - # For every string column perform converting if necessary - column_type = data.supplementary_data.column_types['features'][column_id] - if column_type == TYPE_TO_ID[str]: - string_column = pd.Series(data.features[:, column_id]) - - # Number of nans in the column - nans_number = string_column.isna().sum() - - # Column probably not an "actually categorical" but a column with an incorrectly defined type - converted_column = pd.to_numeric(string_column, errors='coerce') - # Calculate applied nans - result_nans_number = converted_column.isna().sum() - failed_objects_number = result_nans_number - nans_number - non_nan_all_objects_number = n_rows - nans_number - failed_ratio = failed_objects_number / non_nan_all_objects_number - - # If all objects are truly strings - all objects transform into nan - is_column_contain_numerical_objects = failed_ratio != 1 - if failed_ratio < self.acceptable_failed_rate_bottom: - # The majority of objects can be converted into numerical - data.features[:, column_id] = converted_column.values - - # Update information about column types (in-place) - self.categorical_into_float.append(column_id) - features_types = data.supplementary_data.column_types['features'] - features_types[column_id] = TYPE_TO_ID[float] - elif failed_ratio >= self.acceptable_failed_rate_top \ - and is_column_contain_numerical_objects: - # The column consists mostly of truly str values and has a few ints/floats in it - self._remove_pseudo_str_values_from_str_column(data, column_id) - elif self.acceptable_failed_rate_top > failed_ratio >= self.acceptable_failed_rate_bottom: - # Probably numerical column contains a lot of '?' 
or 'x' as nans equivalents
-                    # Add columns to remove list
-                    self.string_columns_transformation_failed.update({column_id: -1})
-
-    def _into_numeric_features_transformation_for_predict(self, data: InputData):
-        """ Apply conversion into float string column for every signed column """
-        if not self.categorical_into_float:
-            # There is no transformation for current table
-            return data
+        str_columns = [
+            column_id for column_id, _ in enumerate(data.features.T)
+            if data.supplementary_data.column_types['features'][column_id] == TYPE_TO_ID[str]]
+        str_cols_df = pd.DataFrame(data.features[:, str_columns], columns=str_columns)
+        orig_nans_cnt = str_cols_df.isna().sum(axis=0)
+
+        converted_str_cols_df = str_cols_df.apply(pd.to_numeric, errors='coerce')
+        conv_nans_cnt = converted_str_cols_df.isna().sum(axis=0)
+
+        failed_objects_cnt = conv_nans_cnt - orig_nans_cnt
+        non_nan_all_objects_cnt = len(data.features) - orig_nans_cnt
+        failed_ratio = failed_objects_cnt / non_nan_all_objects_cnt
+
+        # Check if the majority of objects can be converted into numerical
+        is_numeric = failed_ratio < self.acceptable_failed_rate_bottom
+        is_numeric_ids = is_numeric[is_numeric].index
+        data.features[:, is_numeric_ids] = converted_str_cols_df[is_numeric_ids].to_numpy()
+        self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float))
+        features_types = data.supplementary_data.column_types['features']
+        for column_id in is_numeric_ids:
+            features_types[column_id] = TYPE_TO_ID[float]

-        _, n_cols = data.features.shape
-        for column_id in range(n_cols):
-            if column_id in self.categorical_into_float and column_id not in self.string_columns_transformation_failed:
-                string_column = pd.Series(data.features[:, column_id])
+        # The columns consist mostly of truly str values and have a few ints/floats in them
+        is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1)
+        self._remove_pseudo_str_values_from_str_column(data, is_mixed[is_mixed].index)

-                # Column must be converted into float from categorical
-                converted_column = pd.to_numeric(string_column, errors='coerce')
-                data.features[:, column_id] = converted_column.values
+        # If column contains a lot of '?' or 'x' as nans equivalents
+        # add it to the removal list
+        is_of_mistakes = (
+            (self.acceptable_failed_rate_bottom <= failed_ratio)
+            & (failed_ratio < self.acceptable_failed_rate_top))
+        self.string_columns_transformation_failed.update(dict.fromkeys(is_of_mistakes[is_of_mistakes].index))

-                # Update information about column types (in-place)
-                features_types = data.supplementary_data.column_types['features']
-                features_types[column_id] = TYPE_TO_ID[float]
+    def _into_numeric_features_transformation_for_predict(self, data: InputData):
+        """ Apply conversion into float for every string column marked for it during fit """
+        str_cols_ids = list(set(self.categorical_into_float)
+                            .difference(self.string_columns_transformation_failed))
+        str_cols_df = pd.DataFrame(data.features[:, str_cols_ids], columns=str_cols_ids)
+        data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy()
+        for column_id in str_cols_ids:
+            data.supplementary_data.column_types['features'][column_id] = TYPE_TO_ID[float]

-def define_column_types(table: Optional[np.ndarray]) -> Dict:
+def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame:
     """ Prepare information about types per columns. For each column store unique
-    types, which column contains.
If column with mixed type contain str object
-    additional field 'str_ids' with indices of string objects is prepared
+    types, which column contains.
     """
     if table is None:
-        return {}
+        return pd.DataFrame()

     table_of_types = pd.DataFrame(table, copy=True)
     table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8)

     # Build dataframe with unique types for each column
-    uniques = table_of_types.apply([pd.unique]).rename(index={'unique': 'types'})
+    uniques = table_of_types.apply([pd.unique]).rename(index={'unique': _TYPES})

     # Build dataframe with amount of each type
     counts_index_mapper = {
-        TYPE_TO_ID[str]: 'str_number',
-        TYPE_TO_ID[int]: 'int_number',
-        TYPE_TO_ID[float]: 'float_number',
-        TYPE_TO_ID[type(None)]: 'nan_number'
+        TYPE_TO_ID[float]: _FLOAT_NUMBER,
+        TYPE_TO_ID[int]: _INT_NUMBER,
+        TYPE_TO_ID[str]: _STR_NUMBER,
+        TYPE_TO_ID[type(None)]: _NAN_NUMBER
     }
     types_counts = (
         table_of_types
-        .apply(pd.value_counts, dropna=False)
-        .reindex(counts_index_mapper.keys(), copy=False)
-        .replace(np.nan, 0)
-        .rename(index=counts_index_mapper, copy=False)
-        .astype(int)
+            .apply(pd.value_counts, dropna=False)
+            .reindex(counts_index_mapper.keys(), copy=False)
+            .replace(np.nan, 0)
+            .rename(index=counts_index_mapper, copy=False)
+            .astype(int)
     )

     # Build dataframe with nans indices
-    nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: 'nan_ids'})
+    nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: _NAN_IDS})

     # Combine all dataframes
-    return pd.concat([uniques, types_counts, nans_ids]).to_dict()
+    return pd.concat([uniques, types_counts, nans_ids])

-def find_mixed_types_columns(columns_info: dict):
+def _find_mixed_types_columns(columns_info: pd.DataFrame) -> pd.DataFrame:
     """ Search for columns with several types in them """
-    return [
-        col_id
-        for col_id, col_info in columns_info.items()
-        if len(col_info['types']) > 1
-    ]
+    has_mixed_types = [] if columns_info.empty else columns_info.loc[_TYPES].apply(len) > 1
+    return columns_info.loc[:, has_mixed_types]
+
+
+def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> pd.DataFrame:
+    _cols_have_any = [] if frame.empty else frame.loc[rows_to_select].any()
+    return frame.loc[:, _cols_have_any]

 def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter):
@@ -462,7 +437,7 @@ def type_by_id(current_type_id: int):
         # Occurs if for predict stage there is no target info
         return None

-    n_rows, n_cols = table.shape
+    _, n_cols = table.shape
     for column_id in range(n_cols):
         current_column = table[:, column_id]
         current_type = type_by_id(column_types[column_id])
@@ -472,17 +447,17 @@ def type_by_id(current_type_id: int):
     return table

-def convert_num_column_into_string_array(numerical_column: pd.Series) -> np.array:
+def convert_num_column_into_string_array(numerical_column: pd.Series) -> pd.Series:
     """ Convert non-nan values of a numerical pandas column into strings """
     # convert only non-nans values
     true_nums = numerical_column[numerical_column.notna()]
     numerical_column[true_nums.index] = true_nums.astype(str, copy=False)
-    return numerical_column.to_numpy()
+    return numerical_column

-def _obtain_new_column_type(column_info: dict):
+def _obtain_new_column_type(column_info: pd.Series):
     """ Suggest int or float type based on the presence of nan and float values """
-    if column_info['float_number'] > 0 or column_info['nan_number'] > 0:
+    if column_info[[_FLOAT_NUMBER, _NAN_NUMBER]].any():  #
Even if one of types are float - all elements should be converted into float return float # It is available to convert numerical into integer type @@ -505,7 +480,7 @@ def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: current_type=current_type) -def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) -> list: +def _generate_list_with_types(columns_types_info: pd.DataFrame, converted_columns: Dict[int, Optional[int]]) -> list: """ Create list with types for all remained columns :param columns_types_info: dictionary with initial column types @@ -513,7 +488,7 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) """ updated_column_types = [] for column_id, column_info in columns_types_info.items(): - column_types_ids = column_info['types'] + column_types_ids = column_info[_TYPES] if len(column_types_ids) == 1: # Column initially contain only one type @@ -525,9 +500,9 @@ def _generate_list_with_types(columns_types_info: dict, converted_columns: dict) else: if TYPE_TO_ID[str] in column_types_ids: # Mixed-types column with string - new_column_type = converted_columns[column_id] - if new_column_type != -1: - updated_column_types.append(new_column_type) + new_col_id = converted_columns[column_id] + if new_col_id is not None: + updated_column_types.append(new_col_id) else: # Mixed-types with float and integer updated_column_types.append(TYPE_TO_ID[float]) From 55701809f086f1f9b84b0758efb066fecba8764f Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Thu, 27 Apr 2023 17:42:08 +0300 Subject: [PATCH 34/72] cats ids via numpy --- fedot/api/api_utils/input_analyser.py | 13 +--- fedot/core/data/data_preprocessing.py | 14 ++--- .../data_operations/categorical_encoders.py | 7 +-- fedot/preprocessing/categorical.py | 4 -- fedot/preprocessing/data_types.py | 59 ++++++++++--------- 5 files changed, 41 insertions(+), 56 deletions(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 1c524320a3..3b9c5f2c23 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -1,9 +1,8 @@ from functools import partial from inspect import signature - -import numpy as np from typing import Dict, Tuple, Any, Union +import numpy as np from golem.core.log import default_log from fedot.core.composer.meta_rules import get_cv_folds_number, get_recommended_preset, \ @@ -118,11 +117,5 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - all_cardinality = 0 - need_label = False - for idx in categorical_ids: - all_cardinality += np.unique(input_data.features[:, idx].astype(str)).shape[0] - if all_cardinality > self.max_cat_cardinality: - need_label = True - break - return need_label + uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) + return len(uniques) > self.max_cat_cardinality diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index b241e2cf43..e5565204e6 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,4 +1,4 @@ -from typing import Tuple, Optional, List +from typing import Tuple, Optional import numpy as np import pandas as pd @@ -48,7 +48,7 @@ def divide_data_categorical_numerical(input_data: InputData, categorical_ids: li raise ValueError(f'{prefix} Check data for Nans and inf values') -def find_categorical_columns(table: np.ndarray, column_type_ids: 
Optional[List[int]] = None): +def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.ndarray] = None): """ Method for finding categorical and non-categorical columns in tabular data @@ -63,13 +63,9 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[List[i # Define if data contains string columns for "unknown table" return force_categorical_determination(table) - categorical_ids = [] - non_categorical_ids = [] - for col_id, col_type_id in enumerate(column_type_ids): - if col_type_id == TYPE_TO_ID[str]: - categorical_ids.append(col_id) - else: - non_categorical_ids.append(col_id) + is_str = np.isin(column_type_ids, TYPE_TO_ID[str]) + categorical_ids = np.flatnonzero(is_str).tolist() + non_categorical_ids = np.flatnonzero(~is_str).tolist() return categorical_ids, non_categorical_ids diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 6213c53a75..505d2610db 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -37,12 +37,7 @@ def fit(self, input_data: InputData): """ features = input_data.features column_type_ids = input_data.supplementary_data.column_types.get('features') - categorical_ids, non_categorical_ids = find_categorical_columns(features, - column_type_ids) - - # Indices of columns with categorical and non-categorical features - self.categorical_ids = categorical_ids - self.non_categorical_ids = non_categorical_ids + self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, column_type_ids) # If there are categorical features - process it if self.categorical_ids: diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 750379bff7..fb2e59ee9e 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -1,5 +1,4 @@ from copy import deepcopy -from typing import Tuple import numpy as np import pandas as pd @@ -28,9 +27,6 @@ def fit(self, input_data: InputData): column_type_ids = input_data.supplementary_data.column_types['features'] categorical_ids, _ = find_categorical_columns(input_data.features, column_type_ids) - if len(categorical_ids) == 0: - # There is no need to process categorical features - return self binary_ids_to_convert = [] for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 0e4f699c06..34cd4cecc9 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -194,16 +194,20 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): Such columns have no conflicts with types converting. 
""" if self.string_columns_transformation_failed: - self.log.warning(f'Columns with indices {list(self.string_columns_transformation_failed.keys())} were ' + self.log.warning(f'Columns with indices {self.string_columns_transformation_failed} were ' f'removed during mixed types column converting due to conflicts.') data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) - data.supplementary_data.column_types['features'] = [ - col_type_id - for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) - if col_id not in self.string_columns_transformation_failed - ] + # data.supplementary_data.column_types['features'] = [ + # col_type_id + # for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) + # if col_id not in self.string_columns_transformation_failed + # ] + data.supplementary_data.column_types['features'] = np.delete( + data.supplementary_data.column_types['features'], + list(self.string_columns_transformation_failed.keys()) + ) def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): # Check if columns number correct @@ -302,9 +306,8 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Columns need to be transformed into categorical (string) ones self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) - for column_id in cat_col_ids: - # Update information about column types (in-place) - features_types[column_id] = TYPE_TO_ID[str] + # Update information about column types (in-place) + features_types[cat_col_ids] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -320,16 +323,15 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] - for column_id in self.numerical_into_str: - features_types[column_id] = TYPE_TO_ID[str] + features_types[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ Automatically determine categorical features which should be converted into float """ - str_columns = [ - column_id for column_id, _ in enumerate(data.features.T) - if data.supplementary_data.column_types['features'][column_id] == TYPE_TO_ID[str]] + str_columns = np.flatnonzero( + np.isin(data.supplementary_data.column_types['features'], TYPE_TO_ID[str]) + ) str_cols_df = pd.DataFrame(data.features[:, str_columns], columns=str_columns) orig_nans_cnt = str_cols_df.isna().sum(axis=0) @@ -345,9 +347,10 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): is_numeric_ids = is_numeric[is_numeric].index data.features[:, is_numeric_ids] = converted_str_cols_df[is_numeric_ids].to_numpy() self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) + + # Update information about column types (in-place) features_types = data.supplementary_data.column_types['features'] - for column_id in is_numeric_ids: - features_types[column_id] = TYPE_TO_ID[float] + features_types[is_numeric_ids] = TYPE_TO_ID[float] # The columns consists mostly of truly str values and has a few ints/floats in it is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & 
(failed_ratio != 1) @@ -366,8 +369,10 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): .difference(self.string_columns_transformation_failed)) str_cols_df = pd.DataFrame(data.features[:, str_cols_ids], columns=str_cols_ids) data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() - for column_id in str_cols_ids: - data.supplementary_data.column_types['features'][column_id] = TYPE_TO_ID[float] + + # Update information about column types (in-place) + features_types = data.supplementary_data.column_types['features'] + features_types[str_cols_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: @@ -480,25 +485,25 @@ def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: current_type=current_type) -def _generate_list_with_types(columns_types_info: pd.DataFrame, converted_columns: Dict[int, Optional[int]]) -> list: +def _generate_list_with_types(columns_types_info: pd.DataFrame, + converted_columns: Dict[int, Optional[int]]) -> np.ndarray: """ Create list with types for all remained columns :param columns_types_info: dictionary with initial column types :param converted_columns: dictionary with transformed column types """ updated_column_types = [] - for column_id, column_info in columns_types_info.items(): - column_types_ids = column_info[_TYPES] - if len(column_types_ids) == 1: + for column_id, column_type_ids in columns_types_info.loc[_TYPES].items(): + if len(column_type_ids) == 1: # Column initially contain only one type - updated_column_types.append(column_types_ids[0]) - elif len(column_types_ids) == 2 and TYPE_TO_ID[type(None)] in column_types_ids: + updated_column_types.append(column_type_ids[0]) + elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids: # Column with one type and nans - filtered_types = [x for x in column_types_ids if x != TYPE_TO_ID[type(None)]] + filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]] updated_column_types.append(filtered_types[0]) else: - if TYPE_TO_ID[str] in column_types_ids: + if TYPE_TO_ID[str] in column_type_ids: # Mixed-types column with string new_col_id = converted_columns[column_id] if new_col_id is not None: @@ -507,7 +512,7 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame, converted_column # Mixed-types with float and integer updated_column_types.append(TYPE_TO_ID[float]) - return updated_column_types + return np.array(updated_column_types) def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type): From 1c044eeeffbe27ae4921a1f38e443f4eeb44cfcb Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 15 May 2023 17:32:39 +0300 Subject: [PATCH 35/72] numpy arr extend fix --- .../data_operations/sklearn_transformations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 530eb84320..bb5d2e95d4 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -197,8 +197,8 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): if cols_number_added > 0: # There are new columns in the table col_types = 
output_data.supplementary_data.column_types['features'] - col_types += [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = col_types + new_types = [TYPE_TO_ID[float]] * cols_number_added + output_data.supplementary_data.column_types['features'] = np.append(col_types, new_types) class ScalingImplementation(EncodedInvariantImplementation): From 71560dff597931495fb162e2752f8588c1c60dea Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 15 May 2023 18:22:02 +0300 Subject: [PATCH 36/72] data_types.py cleanup --- fedot/preprocessing/data_types.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 34cd4cecc9..f354f39ab0 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,7 +1,7 @@ from __future__ import annotations from copy import copy -from typing import TYPE_CHECKING, Tuple, Optional, List, Dict +from typing import TYPE_CHECKING, Tuple, Optional, List, Dict, Sequence import numpy as np import pandas as pd @@ -199,14 +199,9 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) - # data.supplementary_data.column_types['features'] = [ - # col_type_id - # for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features']) - # if col_id not in self.string_columns_transformation_failed - # ] data.supplementary_data.column_types['features'] = np.delete( data.supplementary_data.column_types['features'], - list(self.string_columns_transformation_failed.keys()) + list(self.string_columns_transformation_failed) ) def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): @@ -422,7 +417,7 @@ def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> return frame.loc[:, _cols_have_any] -def apply_type_transformation(table: np.ndarray, column_types: list, log: LoggerAdapter): +def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: LoggerAdapter): """ Apply transformation for columns in dataset into desired type. Perform transformation on predict stage when column types were already determined @@ -435,8 +430,7 @@ def type_by_id(current_type_id: int): return int elif current_type_id == TYPE_TO_ID[str]: return str - else: - return float + return float if table is None: # Occurs if for predict stage there is no target info @@ -470,12 +464,12 @@ def _obtain_new_column_type(column_info: pd.Series): def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray, - column_id: int, current_type, log: LoggerAdapter): + column_id: int, current_type: type, log: LoggerAdapter): try: table[:, column_id] = current_column.astype(current_type) if current_type is str: - is_any_comma = any(map(lambda el: ',' in el, current_column)) - is_any_dot = any(map(lambda el: '.' in el, current_column)) + is_any_comma = any(',' in el for el in current_column) + is_any_dot = any('.' in el for el in current_column) # Most likely case: '20,000' must be converted into '20.000' if is_any_comma and is_any_dot: warning = f'Column {column_id} contains both "." and ",". Standardize it.' 
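The warning above fires for columns that mix '.' and ',' decimal separators. A minimal standalone sketch of that standardization idea follows; the helper name and the pandas-based implementation are illustrative only, not part of this patch:

import numpy as np
import pandas as pd


def standardize_decimal_separators(column: np.ndarray) -> np.ndarray:
    # Replace ',' with '.' in string values so that '20,000' parses as 20.0 instead of failing
    cleaned = pd.Series(column).map(lambda el: el.replace(',', '.') if isinstance(el, str) else el)
    # Values that still cannot be parsed become np.nan
    return pd.to_numeric(cleaned, errors='coerce').to_numpy()


print(standardize_decimal_separators(np.array(['20,5', '13.1', 'oops'], dtype=object)))
# [20.5 13.1  nan]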
@@ -515,21 +509,20 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame,
     return np.array(updated_column_types)

-def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type):
+def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type: type):
     """ Process column values one by one and try to convert them into the desired type.
     If not successful replace with np.nan """

     def _process_str_numbers_with_dots_and_commas(value: str):
         """ Try to process str with replacing ',' by '.' in case it was meant to be a number """
         value = value.replace(',', '.')
+        new_value = np.nan
         try:
             # Since "10.6" cannot be converted to 10 straightforwardly using int()
             if current_type is int:
                 new_value = int(float(value))
             else:
                 new_value = current_type(value)
         except ValueError:
-            return np.nan
+            pass
         return new_value

     new_column = []

From 9e89a927c0f5c7be9160e6360052a7a25c4196d0 Mon Sep 17 00:00:00 2001
From: Pakulin Sergei
Date: Mon, 15 May 2023 18:36:25 +0300
Subject: [PATCH 37/72] lint fixes

---
 .../models/discriminant_analysis.py            |  1 -
 .../repository/operation_types_repository.py   |  2 +-
 fedot/preprocessing/base_preprocessing.py      | 24 +++++++++----------
 fedot/preprocessing/data_types.py              | 14 +++++------
 fedot/preprocessing/dummy_preprocessing.py     | 24 +++++++++----------
 test/unit/data/test_supplementary_data.py      |  2 ++
 6 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py
index 317e3d41a4..a0d7bf2a86 100644
--- a/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py
+++ b/fedot/core/operations/evaluation/operation_implementations/models/discriminant_analysis.py
@@ -1,7 +1,6 @@
 from typing import Optional

 import numpy as np
-import pandas as pd
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

 from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ModelImplementation
diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py
index d300f10f15..7e42d95e60 100644
--- a/fedot/core/repository/operation_types_repository.py
+++ b/fedot/core/repository/operation_types_repository.py
@@ -224,7 +224,7 @@ def get_strategies_by_metadata(metadata: dict) -> Union['EvaluationStrategy', Di
     Args:
         metadata: information about meta of the operation
-
+
     Returns:
         available strategies for current metadata
diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py
index a3244559c7..ae0ef29140 100644
--- a/fedot/preprocessing/base_preprocessing.py
+++ b/fedot/preprocessing/base_preprocessing.py
@@ -54,8 +54,8 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) ->
         raise AbstractMethodNotImplementError

     @abstractmethod
-    def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) -> Union[InputData,
-                                                                                              MultiModalData]:
+    def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]
+                                       ) -> Union[InputData, MultiModalData]:
         """
         Performs obligatory preprocessing for pipeline's predict method.
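For reference, the per-value fallback used by _process_str_numbers_with_dots_and_commas above, extracted into a self-contained sketch; coerce_value is a made-up name for illustration, and keeping the else branch (as in the function's final form) is assumed:

import numpy as np


def coerce_value(value: str, target_type: type):
    # Comma-decimal strings such as '10,6' are retried as numbers
    value = value.replace(',', '.')
    new_value = np.nan
    try:
        if target_type is int:
            # int('10.6') raises ValueError, so go through float first
            new_value = int(float(value))
        else:
            new_value = target_type(value)
    except ValueError:
        pass  # keep np.nan for values that cannot be converted
    return new_value


print(coerce_value('10,6', int))   # 10
print(coerce_value('abc', float))  # nan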
@@ -68,8 +68,8 @@ def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) raise AbstractMethodNotImplementError @abstractmethod - def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[InputData, - MultiModalData]: + def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Launches preprocessing operations if it is necessary for pipeline fitting. @@ -83,8 +83,8 @@ def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalDa raise AbstractMethodNotImplementError @abstractmethod - def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[InputData, - MultiModalData]: + def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Launches preprocessing operations if it is necessary for pipeline predict stage. Preprocessor must be already fitted. @@ -135,8 +135,8 @@ def apply_inverse_target_encoding(self, column_to_transform: np.ndarray) -> np.n raise AbstractMethodNotImplementError @abstractmethod - def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData]) -> \ - Union[InputData, MultiModalData]: + def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Converts provided data's and pipeline's indexes for fit @@ -150,8 +150,8 @@ def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, M raise AbstractMethodNotImplementError @abstractmethod - def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> \ - Union[InputData, MultiModalData]: + def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Converts provided data's and pipeline's indexes for predict @@ -178,8 +178,8 @@ def restore_index(self, input_data: InputData, result: OutputData) -> OutputData raise AbstractMethodNotImplementError @abstractmethod - def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData]) -> Union[InputData, - MultiModalData]: + def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: """ Replaces indices for time series for predict stage diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index f354f39ab0..3feb294ab3 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -354,8 +354,8 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): # If column contains a lot of '?' 
or 'x' as nans equivalents # add it remove list is_of_mistakes = ( - (self.acceptable_failed_rate_bottom <= failed_ratio) - & (failed_ratio < self.acceptable_failed_rate_top)) + (self.acceptable_failed_rate_bottom <= failed_ratio) & + (failed_ratio < self.acceptable_failed_rate_top)) self.string_columns_transformation_failed.update(dict.fromkeys(is_of_mistakes[is_of_mistakes].index)) def _into_numeric_features_transformation_for_predict(self, data: InputData): @@ -392,11 +392,11 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: } types_counts = ( table_of_types - .apply(pd.value_counts, dropna=False) - .reindex(counts_index_mapper.keys(), copy=False) - .replace(np.nan, 0) - .rename(index=counts_index_mapper, copy=False) - .astype(int) + .apply(pd.value_counts, dropna=False) + .reindex(counts_index_mapper.keys(), copy=False) + .replace(np.nan, 0) + .rename(index=counts_index_mapper, copy=False) + .astype(int) ) # Build dataframe with nans indices diff --git a/fedot/preprocessing/dummy_preprocessing.py b/fedot/preprocessing/dummy_preprocessing.py index b088938c6c..36b76a390c 100644 --- a/fedot/preprocessing/dummy_preprocessing.py +++ b/fedot/preprocessing/dummy_preprocessing.py @@ -25,18 +25,18 @@ def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) -> BasePreprocessor.mark_as_preprocessed(data) return data - def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def obligatory_prepare_for_predict(self, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: BasePreprocessor.mark_as_preprocessed(data) return data - def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def optional_prepare_for_fit(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: BasePreprocessor.mark_as_preprocessed(data, is_obligatory=False) return data - def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def optional_prepare_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: BasePreprocessor.mark_as_preprocessed(data, is_obligatory=False) return data @@ -49,17 +49,17 @@ def cut_dataset(self, data: InputData, border: int): def apply_inverse_target_encoding(self, column_to_transform: np.ndarray) -> np.ndarray: return column_to_transform - def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def convert_indexes_for_fit(self, pipeline: 'Pipeline', data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: return data - def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: return data def restore_index(self, input_data: InputData, result: OutputData) -> OutputData: return result - def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData]) -> Union[ - InputData, MultiModalData]: + def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData] + ) -> Union[InputData, MultiModalData]: return test_data diff --git a/test/unit/data/test_supplementary_data.py 
b/test/unit/data/test_supplementary_data.py index 5d5581139e..152d62636d 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -12,6 +12,8 @@ from fedot.preprocessing.data_types import TYPE_TO_ID from test.unit.tasks.test_regression import get_synthetic_regression_data +from test.unit.data.test_data_merge import unequal_outputs_table # noqa, fixture + @pytest.fixture() def outputs_table_with_different_types(): From b91a993441b8905d679d028264042f9ca960f6d1 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 15 May 2023 18:39:18 +0300 Subject: [PATCH 38/72] lint fixes (v2) --- test/unit/data/test_supplementary_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index 152d62636d..0318f2e19d 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -47,7 +47,7 @@ def generate_straight_pipeline(): return pipeline -def test_parent_mask_correct(unequal_outputs_table): +def test_parent_mask_correct(unequal_outputs_table): # noqa, fixture """ Test correctness of function for tables mask generation """ correct_parent_mask = {'input_ids': [0, 1], 'flow_lens': [1, 0]} From 74fe8b8f9712f736af65830ea8a828cb66442a0a Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 16 May 2023 18:35:31 +0300 Subject: [PATCH 39/72] supp_data typing upd --- fedot/core/data/supplementary_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index 2456617c75..d4238818e8 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional +from typing import Optional, Dict import numpy as np @@ -28,7 +28,7 @@ class SupplementaryData: # Collection with non-int indexes non_int_idx: Optional[list] = None # Dictionary with features and target column types - column_types: Optional[dict] = None + column_types: Optional[Dict[str, np.ndarray]] = None @property def compound_mask(self): From d3534d07c42892c35ff04295b128f1afe157937f Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 16 May 2023 19:00:24 +0300 Subject: [PATCH 40/72] ensure all column_types are of ndarray type --- .../data/merge/supplementary_data_merger.py | 11 ++++---- .../data_operations/categorical_encoders.py | 15 ++++------- .../data_operations/sklearn_selectors.py | 18 ++++++------- .../sklearn_transformations.py | 6 ++--- .../data_operations/ts_transformations.py | 11 ++++---- fedot/core/operations/model.py | 22 ++++++++++------ fedot/preprocessing/categorical.py | 5 ++-- fedot/preprocessing/data_types.py | 23 ++++++++-------- fedot/preprocessing/preprocessing.py | 2 +- test/unit/data/test_supplementary_data.py | 11 ++++---- .../test_data_operations_implementations.py | 26 ++++++++++--------- .../test_preprocessing_through_api.py | 2 +- test/unit/preprocessing/test_preprocessors.py | 12 ++++----- 13 files changed, 81 insertions(+), 83 deletions(-) diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 1d1c200cae..96cd509460 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -1,5 +1,6 @@ from typing import List, Dict +import numpy as np from golem.core.log import default_log from fedot.core.data.data 
import OutputData @@ -83,7 +84,7 @@ def merge_column_types(self) -> Dict: # Concatenate types for features columns and # choose target type of the main target as the new target type - new_features_types = [] + new_feature_types = [] new_target_types = None for output in self.outputs: if output.supplementary_data.column_types is None: @@ -92,12 +93,10 @@ def merge_column_types(self) -> Dict: output.supplementary_data.column_types = table_corr.prepare_column_types_info(output.predict, output.target, output.task) - col_types = output.supplementary_data.column_types['features'] - new_features_types.extend(col_types) + feature_types = output.supplementary_data.column_types['features'] + new_feature_types.extend(feature_types) if output.supplementary_data.is_main_target: # Target can be None for predict stage new_target_types = output.supplementary_data.column_types.get('target') - - column_types = {'features': new_features_types, 'target': new_target_types} - return column_types + return {'features': np.array(new_feature_types), 'target': new_target_types} diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 505d2610db..7d9894cc5a 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -74,12 +74,12 @@ def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. Categorical columns becomes integer with extension """ if self.categorical_ids: # There are categorical features in the table - col_types = output_data.supplementary_data.column_types['features'] - numerical_columns = [t_name for t_name in col_types if t_name != TYPE_TO_ID[str]] + feature_types = output_data.supplementary_data.column_types['features'] + numerical_columns = feature_types[np.isin(feature_types, TYPE_TO_ID[str], invert=True)] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) - numerical_columns += [TYPE_TO_ID[int]] * encoded_columns_number + numerical_columns = np.append(numerical_columns, [TYPE_TO_ID[int]] * encoded_columns_number) output_data.encoded_idx = self.encoded_ids output_data.supplementary_data.column_types['features'] = numerical_columns @@ -146,13 +146,8 @@ def transform(self, input_data: InputData) -> OutputData: def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. 
Categorical becomes integer """ - if self.categorical_ids: - # Categorical features were in the dataset - col_types = output_data.supplementary_data.column_types['features'] - for categorical_id in self.categorical_ids: - col_types[categorical_id] = TYPE_TO_ID[int] - - output_data.supplementary_data.column_types['features'] = col_types + feature_types = output_data.supplementary_data.column_types['features'] + feature_types[self.categorical_ids] = TYPE_TO_ID[int] def _fit_label_encoders(self, input_data: InputData): """ Fit LabelEncoder for every categorical column in the dataset """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index aa052d7a51..23c56329e1 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -77,16 +77,14 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): """ Update column types after applying feature selection operations """ if len(source_features_shape) < 2: return output_data - else: - if self.features_columns_number > 1: - cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] - if cols_number_removed > 0: - # There are several columns, which were dropped - col_types = output_data.supplementary_data.column_types['features'] - - # Calculate - remained_column_types = np.array(col_types)[self.remain_features_mask] - output_data.supplementary_data.column_types['features'] = list(remained_column_types) + if self.features_columns_number > 1: + cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] + if cols_number_removed: + # There are several columns, which were dropped + feature_types = output_data.supplementary_data.column_types['features'] + + # Calculate + output_data.supplementary_data.column_types['features'] = feature_types[self.remain_features_mask] def _make_new_table(self, features): """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index bb5d2e95d4..f94103f805 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -89,7 +89,7 @@ def update_column_types(output_data: OutputData) -> OutputData: """ _, n_cols = output_data.predict.shape - output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float]] * n_cols + output_data.supplementary_data.column_types['features'] = np.array([TYPE_TO_ID[float]] * n_cols) return output_data @@ -196,9 +196,9 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_added = output_data.predict.shape[1] - source_features_shape[1] if cols_number_added > 0: # There are new columns in the table - col_types = output_data.supplementary_data.column_types['features'] + feature_types = output_data.supplementary_data.column_types['features'] new_types = [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = np.append(col_types, new_types) + output_data.supplementary_data.column_types['features'] = 
np.append(feature_types, new_types) class ScalingImplementation(EncodedInvariantImplementation): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 74dd395fdb..67ce85ff3d 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -127,13 +127,14 @@ def _update_column_types(self, output_data: OutputData): """Update column types after lagged transformation. All features becomes ``float`` """ - features_n_rows, features_n_cols = output_data.predict.shape - features_column_types = [TYPE_TO_ID[float]] * features_n_cols - column_types = {'features': features_column_types} + _, features_n_cols = output_data.predict.shape + feature_types = np.array([TYPE_TO_ID[float]] * features_n_cols) + column_types = {'features': feature_types} if output_data.target is not None and len(output_data.target.shape) > 1: - target_n_rows, target_n_cols = output_data.target.shape - column_types.update({'target': [TYPE_TO_ID[float]] * target_n_cols}) + _, target_n_cols = output_data.target.shape + target_types = np.array([TYPE_TO_ID[float]] * target_n_cols) + column_types['target'] = target_types output_data.supplementary_data.column_types = column_types def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py index 9e38f5e27c..250016fc74 100644 --- a/fedot/core/operations/model.py +++ b/fedot/core/operations/model.py @@ -35,33 +35,39 @@ def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> Ou # Add information about features if is_regression_task or is_ts_forecasting_task: if len(predict_shape) < 2: - column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} + column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} else: - column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} + column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} else: if len(predict_shape) < 2: output_data.predict = output_data.predict.reshape((-1, 1)) predict_shape = output_data.predict.shape # Classification task or clustering target_type = int if output_mode == 'labels' else float - column_info = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} + column_types = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} + + # Make feature types static to suit supplementary data contract + column_types['features'] = np.array(column_types['features']) # Add information about target target_shape = output_data.target.shape if output_data.target is not None else None if target_shape is None: # There is no target column in output data - output_data.supplementary_data.column_types = column_info + output_data.supplementary_data.column_types = column_types return output_data if is_regression_task or is_ts_forecasting_task: if len(target_shape) > 1: - column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]}) + column_types['target'] = [TYPE_TO_ID[float]] * target_shape[1] else: # Array present "time series" - column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)}) + column_types['target'] = [TYPE_TO_ID[float]] * len(output_data.target) else: # Classification task or clustering - column_info.update({'target': 
[TYPE_TO_ID[int]] * predict_shape[1]}) + column_types['target'] = [TYPE_TO_ID[int]] * predict_shape[1] + + # Make target types static to suit supplementary data contract + column_types['target'] = np.array(column_types['target']) - output_data.supplementary_data.column_types = column_info + output_data.supplementary_data.column_types = column_types return output_data diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index fb2e59ee9e..10e7abb339 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -63,9 +63,8 @@ def transform(self, input_data: InputData) -> InputData: self._apply_encoder(copied_data.features) # Update features types - features_types = copied_data.supplementary_data.column_types['features'] - for converted_column_id in self.binary_ids_to_convert: - features_types[converted_column_id] = TYPE_TO_ID[int] + feature_types = copied_data.supplementary_data.column_types['features'] + feature_types[self.binary_ids_to_convert] = TYPE_TO_ID[int] return copied_data def fit_transform(self, input_data: InputData) -> InputData: diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 3feb294ab3..95d1abb6db 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,6 +1,5 @@ from __future__ import annotations -from copy import copy from typing import TYPE_CHECKING, Tuple, Optional, List, Dict, Sequence import numpy as np @@ -93,8 +92,8 @@ def convert_data_for_fit(self, data: InputData): # Launch conversion float and integer features into categorical self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types - self.features_types = copy(data.supplementary_data.column_types['features']) - self.target_types = copy(data.supplementary_data.column_types['target']) + self.features_types = data.supplementary_data.column_types['features'].copy() + self.target_types = data.supplementary_data.column_types['target'].copy() self._retain_columns_info_without_types_conflicts(data) return data @@ -289,8 +288,8 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - features_types = data.supplementary_data.column_types['features'] - is_numeric_type = np.isin(features_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) + feature_types = data.supplementary_data.column_types['features'] + is_numeric_type = np.isin(feature_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) nuniques = num_df.nunique(dropna=True) @@ -302,7 +301,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): # Columns need to be transformed into categorical (string) ones self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) # Update information about column types (in-place) - features_types[cat_col_ids] = TYPE_TO_ID[str] + feature_types[cat_col_ids] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -317,8 +316,8 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[self.numerical_into_str] = TYPE_TO_ID[str] + feature_types = data.supplementary_data.column_types['features'] + feature_types[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ @@ -344,8 +343,8 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[is_numeric_ids] = TYPE_TO_ID[float] + feature_types = data.supplementary_data.column_types['features'] + feature_types[is_numeric_ids] = TYPE_TO_ID[float] # The columns consists mostly of truly str values and has a few ints/floats in it is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1) @@ -366,8 +365,8 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) - features_types = data.supplementary_data.column_types['features'] - features_types[str_cols_ids] = TYPE_TO_ID[float] + feature_types = data.supplementary_data.column_types['features'] + feature_types[str_cols_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 5186a1b84c..1c2242e744 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -428,7 +428,7 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarra encoded_target = data.target if encoder is not None: # Target encoders have already been fitted - data.supplementary_data.column_types['target'] = [TYPE_TO_ID[int]] + data.supplementary_data.column_types['target'] = np.array([TYPE_TO_ID[int]]) encoded_target = encoder.transform(encoded_target) if len(encoded_target.shape) == 1: encoded_target = 
encoded_target.reshape((-1, 1)) diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index 0318f2e19d..b1447a1b85 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -10,9 +10,8 @@ from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.preprocessing.data_types import TYPE_TO_ID -from test.unit.tasks.test_regression import get_synthetic_regression_data - from test.unit.data.test_data_merge import unequal_outputs_table # noqa, fixture +from test.unit.tasks.test_regression import get_synthetic_regression_data @pytest.fixture() @@ -21,15 +20,15 @@ def outputs_table_with_different_types(): task = Task(TaskTypesEnum.regression) idx = [0, 1, 2] target = [1, 2, 10] - data_info_first = SupplementaryData(column_types={'features': [TYPE_TO_ID[str], TYPE_TO_ID[float]], - 'target': [TYPE_TO_ID[int]]}) + data_info_first = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str], TYPE_TO_ID[float]]), + 'target': np.array([TYPE_TO_ID[int]])}) output_first = OutputData(idx=idx, features=None, predict=np.array([['a', 1.1], ['b', 2], ['c', 3]], dtype=object), task=task, target=target, data_type=DataTypesEnum.table, supplementary_data=data_info_first) - data_info_second = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]], - 'target': [TYPE_TO_ID[int]]}) + data_info_second = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]]), + 'target': np.array([TYPE_TO_ID[int]])}) output_second = OutputData(idx=idx, features=None, predict=np.array([[2.5], [2.1], [9.3]], dtype=float), task=task, target=target, data_type=DataTypesEnum.table, diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index 8910f154c0..fac04125ae 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -129,7 +129,7 @@ def get_multivariate_time_series(mutli_ts=False): def get_nan_inf_data(): - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[float]] * 4}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]] * 4)}) train_input = InputData(idx=[0, 1, 2, 3], features=np.array([[1, 2, 3, 4], [2, np.nan, 4, 5], @@ -144,8 +144,8 @@ def get_nan_inf_data(): def get_single_feature_data(task=None): - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int]], - 'target': [TYPE_TO_ID[int]]}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int]]), + 'target': np.array([TYPE_TO_ID[int]])}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=np.array([[1], [2], [3], [7], [8], [9]]), target=np.array([[0], [0], [0], [1], [1], [1]]), @@ -168,10 +168,11 @@ def get_mixed_data(task=None, extended=False): [np.nan, np.nan, '1', np.nan, '2', 'not blue', 'di'], [8, '1', '1', 0, '1', 'not blue', 'da bu'], [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object) - features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], - TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]] + features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], + TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) + target_types = np.array([TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': features_types, 
- 'target': [TYPE_TO_ID[int]]}) + 'target': target_types}) else: features = np.array([[1, '0', 1], [2, '1', 0], @@ -179,9 +180,10 @@ def get_mixed_data(task=None, extended=False): [7, '1', 1], [8, '1', 1], [9, '0', 0]], dtype=object) - features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]] + features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) + target_types = np.array([TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': features_types, - 'target': [TYPE_TO_ID[int]]}) + 'target': target_types}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, @@ -200,7 +202,7 @@ def get_nan_binary_data(task=None): Binary int columns must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - features_types = [TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]] + features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': features_types}) features = np.array([[1, '0', 0], [np.nan, np.nan, np.nan], @@ -231,8 +233,8 @@ def get_unbalanced_dataset(size=10, disbalance=0.4, target_dim=None): target = target.reshape(-1, 1) supp_data = SupplementaryData(column_types={ - 'features': [TYPE_TO_ID[int], TYPE_TO_ID[str]], - 'target': [TYPE_TO_ID[int]] + 'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[str]]), + 'target': np.array([TYPE_TO_ID[int]]) }) input_data = InputData(idx=np.arange(features.shape[0]), @@ -251,7 +253,7 @@ def data_with_binary_int_features_and_equal_categories(): must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[int], TYPE_TO_ID[int]]}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[int]])}) task = Task(TaskTypesEnum.classification) features = np.array([[1, 10], [np.nan, np.nan], diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py index 3b0e60fc25..5726b041df 100644 --- a/test/unit/preprocessing/test_preprocessing_through_api.py +++ b/test/unit/preprocessing/test_preprocessing_through_api.py @@ -11,7 +11,7 @@ def data_with_only_categorical_features(): """ Generate tabular data with only categorical features. All of them are binary. 
""" - supp_data = SupplementaryData(column_types={'features': [TYPE_TO_ID[str]] * 3}) + supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str]] * 3)}) task = Task(TaskTypesEnum.regression) features = np.array([["'a'", "0", "1"], ["'b'", "1", "0"], diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 562e821a3e..5cdab850d4 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -129,15 +129,15 @@ def test_column_types_converting_correctly(): types_corr = TableTypesCorrector() data = types_corr.convert_data_for_fit(data) - features_types = data.supplementary_data.column_types['features'] + feature_types = data.supplementary_data.column_types['features'] target_types = data.supplementary_data.column_types['target'] - assert len(features_types) == 4 + assert len(feature_types) == 4 assert len(target_types) == 2 - assert features_types[0] == TYPE_TO_ID[str] - assert features_types[1] == TYPE_TO_ID[str] - assert features_types[2] == TYPE_TO_ID[str] - assert target_types[0] == target_types[1] == TYPE_TO_ID[str] + assert feature_types[0] == TYPE_TO_ID[str] + assert feature_types[1] == TYPE_TO_ID[str] + assert feature_types[2] == TYPE_TO_ID[str] + assert (target_types == TYPE_TO_ID[str]).all() def test_column_types_process_correctly(): From 08e221e911b76fe43d6ab349ff96a8f1a96ae064 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 17 May 2023 14:30:01 +0300 Subject: [PATCH 41/72] column types naming fix --- fedot/core/data/data_preprocessing.py | 2 +- .../data/merge/supplementary_data_merger.py | 2 +- .../data_operations/categorical_encoders.py | 8 ++--- .../sklearn_transformations.py | 4 +-- fedot/preprocessing/categorical.py | 4 +-- fedot/preprocessing/data_types.py | 30 +++++++++---------- fedot/preprocessing/preprocessing.py | 2 +- test/unit/data/test_supplementary_data.py | 10 +++---- .../test_data_operations_implementations.py | 22 +++++++------- test/unit/preprocessing/test_preprocessors.py | 6 ++-- 10 files changed, 45 insertions(+), 45 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index e5565204e6..40e2d23f42 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -54,7 +54,7 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda Args: table: tabular data for string columns types determination. - column_type_ids: list with column types. If None, perform default checking. + column_type_ids: list with column type ids. If None, perform default checking. Returns: categorical_ids: indices of categorical columns in table. non_categorical_ids: indices of non categorical columns in table. 
diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 96cd509460..dc16538e08 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -76,7 +76,7 @@ def prepare_parent_mask(self) -> Dict: features_mask = {'input_ids': input_ids, 'flow_lens': flow_lens} return features_mask - def merge_column_types(self) -> Dict: + def merge_column_types(self) -> Dict[str, np.ndarray]: """ Store information about column types in tabular data for merged data """ if self.main_output.data_type is not DataTypesEnum.table: # Data is not tabular diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 7d9894cc5a..55071ae5f2 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -36,8 +36,8 @@ def fit(self, input_data: InputData): :return encoder: trained encoder (optional output) """ features = input_data.features - column_type_ids = input_data.supplementary_data.column_types.get('features') - self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, column_type_ids) + feature_type_ids = input_data.supplementary_data.column_types['features'] + self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, feature_type_ids) # If there are categorical features - process it if self.categorical_ids: @@ -119,9 +119,9 @@ def __init__(self, params: Optional[OperationParameters] = None): self.non_categorical_ids: List[int] = None def fit(self, input_data: InputData): - column_type_ids = input_data.supplementary_data.column_types.get('features') + feature_type_ids = input_data.supplementary_data.column_types['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - column_type_ids) + feature_type_ids) # If there are categorical features - process it if self.categorical_ids: diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index f94103f805..05d824e320 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -290,9 +290,9 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) if data_type_is_table(input_data) and data_has_categorical_features(input_data): - column_type_ids = input_data.supplementary_data.column_types.get('features') + feature_type_ids = input_data.supplementary_data.column_types['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - column_type_ids) + feature_type_ids) numerical, categorical = divide_data_categorical_numerical(input_data, self.categorical_ids, self.non_categorical_ids) diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 10e7abb339..162611c28c 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -24,9 +24,9 @@ def fit(self, input_data: 
InputData): Find indices of columns which are contains categorical values. Binary features and at the same time has str objects. If there are such features - convert it into int """ - column_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.column_types['features'] categorical_ids, _ = find_categorical_columns(input_data.features, - column_type_ids) + feature_type_ids) binary_ids_to_convert = [] for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 95d1abb6db..45bfca42bc 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -12,9 +12,9 @@ from fedot.core.data.data import InputData _convertable_types = (bool, float, int, str, type(None)) # preserve lexicographical order -_types_ids = range(len(_convertable_types)) +_type_ids = range(len(_convertable_types)) -TYPE_TO_ID = dict(zip(_convertable_types, _types_ids)) +TYPE_TO_ID = dict(zip(_convertable_types, _type_ids)) _TYPES = 'types' _FLOAT_NUMBER = 'float_number' @@ -64,8 +64,8 @@ def __init__(self): self.target_converting_has_errors = False # Lists with column types for converting calculated on source input data - self.features_types = None - self.target_types = None + self.feature_type_ids = None + self.target_type_ids = None self.log = default_log(self) def convert_data_for_fit(self, data: InputData): @@ -78,7 +78,7 @@ def convert_data_for_fit(self, data: InputData): self.target_columns_info = define_column_types(data.target) # Correct types in features table - data.features = self.features_types_converting(features=data.features) + data.features = self.feature_types_converting(features=data.features) # Remain only correct columns data.features = self.remove_incorrect_features(data.features, self.features_converted_columns) @@ -92,8 +92,8 @@ def convert_data_for_fit(self, data: InputData): # Launch conversion float and integer features into categorical self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types - self.features_types = data.supplementary_data.column_types['features'].copy() - self.target_types = data.supplementary_data.column_types['target'].copy() + self.feature_type_ids = data.supplementary_data.column_types['features'].copy() + self.target_type_ids = data.supplementary_data.column_types.get('target', np.array()).copy() self._retain_columns_info_without_types_conflicts(data) return data @@ -103,8 +103,8 @@ def convert_data_for_predict(self, data: InputData): # Ordering is important because after removing incorrect features - indices are obsolete data.features = data.features.astype(object) data.features = self.remove_incorrect_features(data.features, self.features_converted_columns) - data.features = apply_type_transformation(data.features, self.features_types, self.log) - data.target = apply_type_transformation(data.target, self.target_types, self.log) + data.features = apply_type_transformation(data.features, self.feature_type_ids, self.log) + data.target = apply_type_transformation(data.target, self.target_type_ids, self.log) data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) @@ -126,7 +126,7 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict): table = np.delete(table, self.columns_to_del, 1) return table - def 
features_types_converting(self, features: np.ndarray) -> np.ndarray: + def feature_types_converting(self, features: np.ndarray) -> np.ndarray: """ Convert all elements in the data in every feature column into one type :param features: tabular features array @@ -173,20 +173,20 @@ def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = if self.features_columns_info.empty: # Information about column types is empty - there is a need to launch algorithm to collect info self.features_columns_info = define_column_types(predictors) - predictors = self.features_types_converting(features=predictors) + predictors = self.feature_types_converting(features=predictors) if self.target_columns_info.empty and task.task_type is not TaskTypesEnum.ts_forecasting: self.target_columns_info = define_column_types(target) target = self.target_types_converting(target=target, task=task) - features_types = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) - self._check_columns_vs_types_number(predictors, features_types) + feature_types = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) + self._check_columns_vs_types_number(predictors, feature_types) if target is None or task.task_type is TaskTypesEnum.ts_forecasting: - return {'features': features_types} + return {'features': feature_types} else: target_types = _generate_list_with_types(self.target_columns_info, self.target_converted_columns) self._check_columns_vs_types_number(target, target_types) - return {'features': features_types, 'target': target_types} + return {'features': feature_types, 'target': target_types} def _retain_columns_info_without_types_conflicts(self, data: InputData): """ Update information in supplementary info - retain info only about remained columns. 
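The type-id arrays being renamed in this hunk all derive from one per-cell pass over the table: every element is mapped to the id of its Python type, with NaN counted as NoneType (this is what define_column_types in this module does). A compact sketch of that pass, again with a local TYPE_TO_ID stand-in rather than the module constant:

import numpy as np
import pandas as pd

TYPE_TO_ID = {bool: 0, float: 1, int: 2, str: 3, type(None): 4}  # local stand-in

def cell_type_ids(table: np.ndarray) -> pd.DataFrame:
    # Map every cell to the id of its Python type; NaN cells count as NoneType
    df = pd.DataFrame(table, copy=True)
    return df.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)])

table = np.array([[1, 'a'], [2.5, np.nan]], dtype=object)
print(cell_type_ids(table))
#    0  1
# 0  2  3
# 1  1  4

The unique ids per column then give the 'types' row that feature_types_converting and target_types_converting consume.
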
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 1c2242e744..3ad213c912 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -402,7 +402,7 @@ def _train_target_encoder(self, data: InputData, source_name: str): data: data to be encoded source_name: name of the data source node """ - categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.column_types['target']) + categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.column_types.get('target')) if categorical_ids: # Target is categorical diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index b1447a1b85..7c768392b5 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -118,11 +118,11 @@ def test_define_types_after_merging(outputs_table_with_different_types): merged_data = DataMerger.get(outputs).merge() updated_info = merged_data.supplementary_data - features_types = updated_info.column_types['features'] - target_types = updated_info.column_types['target'] + feature_type_ids = updated_info.column_types['features'] + target_type_ids = updated_info.column_types['target'] # Target type must stay the same ancestor_target_type = outputs[0].supplementary_data.column_types['target'][0] - assert target_types[0] == ancestor_target_type - assert len(features_types) == 3 - assert tuple(features_types) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float]) + assert target_type_ids[0] == ancestor_target_type + assert len(feature_type_ids) == 3 + assert tuple(feature_type_ids) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float]) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index fac04125ae..9064781164 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -168,11 +168,11 @@ def get_mixed_data(task=None, extended=False): [np.nan, np.nan, '1', np.nan, '2', 'not blue', 'di'], [8, '1', '1', 0, '1', 'not blue', 'da bu'], [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object) - features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], - TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) - target_types = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': features_types, - 'target': target_types}) + feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], + TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) + target_type_ids = np.array([TYPE_TO_ID[int]]) + supp_data = SupplementaryData(column_types={'features': feature_type_ids, + 'target': target_type_ids}) else: features = np.array([[1, '0', 1], [2, '1', 0], @@ -180,10 +180,10 @@ def get_mixed_data(task=None, extended=False): [7, '1', 1], [8, '1', 1], [9, '0', 0]], dtype=object) - features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) - target_types = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': features_types, - 'target': target_types}) + feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) + target_type_ids = np.array([TYPE_TO_ID[int]]) + supp_data = SupplementaryData(column_types={'features': feature_type_ids, + 'target': target_type_ids}) train_input = 
InputData(idx=[0, 1, 2, 3, 4, 5], features=features, @@ -202,8 +202,8 @@ def get_nan_binary_data(task=None): Binary int columns must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - features_types = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': features_types}) + feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) + supp_data = SupplementaryData(column_types={'features': feature_type_ids}) features = np.array([[1, '0', 0], [np.nan, np.nan, np.nan], [0, '2', 1], diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 5cdab850d4..afb64661a3 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -156,10 +156,10 @@ def test_column_types_process_correctly(): pipeline.fit(train_data) predicted = pipeline.predict(test_data) - features_types_ids = predicted.supplementary_data.column_types['features'] - assert len(features_types_ids) == predicted.predict.shape[1] + feature_type_ids = predicted.supplementary_data.column_types['features'] + assert len(feature_type_ids) == predicted.predict.shape[1] # All output values are float - assert all(feature_type_id == TYPE_TO_ID[float] for feature_type_id in features_types_ids) + assert (feature_type_ids == TYPE_TO_ID[float]).all() def test_complicated_table_types_processed_correctly(): From f9e47cf471c79688209abd1d1c3ef2584ba794ac Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 17 May 2023 16:42:07 +0300 Subject: [PATCH 42/72] remove unused f-string signs --- fedot/core/data/merge/supplementary_data_merger.py | 2 +- fedot/core/data/supplementary_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index dc16538e08..866f450cc2 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -88,7 +88,7 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: new_target_types = None for output in self.outputs: if output.supplementary_data.column_types is None: - self.log.debug(f'Perform determination of column types in DataMerger') + self.log.debug('Perform determination of column types in DataMerger') table_corr = TableTypesCorrector() output.supplementary_data.column_types = table_corr.prepare_column_types_info(output.predict, output.target, diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index d4238818e8..bd9f8d8881 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -53,7 +53,7 @@ def define_parents(self, unique_features_masks: np.array, task: TaskTypesEnum): :param task: task to solve """ if not isinstance(self.previous_operations, list) or len(self.previous_operations) == 1: - raise ValueError(f'Data was received from one node but at least two nodes are required') + raise ValueError('Data was received from one node but at least two nodes are required') data_operations = OperationTypesRepository('data_operation').suitable_operation(task_type=task) From 415bb0b242cf490a14026a736d5b1ee90278f775 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Wed, 5 Jul 2023 19:25:09 +0300 Subject: [PATCH 43/72] minor fixes --- fedot/core/data/data_preprocessing.py | 6 +++--- 
fedot/preprocessing/data_types.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 40e2d23f42..dd710c0f14 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - has_infs = ((features == np.inf) | (features == -np.inf)) - if np.any(has_infs): - features[has_infs] = np.nan + is_inf = (features == np.inf) | (features == -np.inf) + if np.any(is_inf): + features[is_inf] = np.nan def replace_nans_with_empty_strings(input_data: InputData): diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 45bfca42bc..55bbe88441 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -93,7 +93,9 @@ def convert_data_for_fit(self, data: InputData): self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types self.feature_type_ids = data.supplementary_data.column_types['features'].copy() - self.target_type_ids = data.supplementary_data.column_types.get('target', np.array()).copy() + self.target_type_ids = data.supplementary_data.column_types.get( + 'target', np.empty((self.feature_type_ids.shape[0], 1), dtype=float) + ).copy() self._retain_columns_info_without_types_conflicts(data) return data From 3d7043657fc8df3a36ee72ccdbca8c3648d9a0df Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Fri, 7 Jul 2023 20:32:15 +0300 Subject: [PATCH 44/72] minor dct update fix --- fedot/preprocessing/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 162611c28c..449dbbdf8c 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -88,7 +88,7 @@ def _train_encoder(self, column: pd.Series): encoder.fit(column) # Store fitted label encoder for transform method - self.binary_encoders.update({column.name: encoder}) + self.binary_encoders[column.name] = encoder def _apply_encoder(self, data: np.ndarray): """ From 6be9b2c01723e6087f7091242252fccb51f012af Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Fri, 7 Jul 2023 20:32:56 +0300 Subject: [PATCH 45/72] data_types.py further vectorization --- fedot/preprocessing/data_types.py | 223 ++++++++++++------------------ 1 file changed, 92 insertions(+), 131 deletions(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 55bbe88441..da0dc272fe 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -135,15 +135,7 @@ def feature_types_converting(self, features: np.ndarray) -> np.ndarray: """ mixed_types_columns = _find_mixed_types_columns(self.features_columns_info) cols_with_strings_or_floats = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER, _FLOAT_NUMBER]) - - def _update_converted_columns_and_data(column_info: pd.Series): - updated_column, new_type_id = self._convert_feature_into_one_type(features[:, column_info.name], - column_info) - self.features_converted_columns[column_info.name] = new_type_id - if updated_column is not None: - features[:, column_info.name] = updated_column - - cols_with_strings_or_floats.apply(_update_converted_columns_and_data) + cols_with_strings_or_floats.apply(self._convert_feature_into_one_type, features=features) 
        return features

@@ -155,15 +147,7 @@ def target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray:
        """
        mixed_types_columns = _find_mixed_types_columns(self.target_columns_info)
        cols_with_strings = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER])
-
-        def _update_converted_columns_and_data(column_info: pd.Series):
-            updated_column, new_type_id = self._convert_target_into_one_type(target[:, column_info.name], column_info,
-                                                                             task)
-            self.target_converted_columns[column_info.name] = new_type_id
-            if updated_column is not None:
-                target[:, column_info.name] = updated_column
-
-        cols_with_strings.apply(_update_converted_columns_and_data)
+        cols_with_strings.apply(self._convert_target_into_one_type, target=target, task=task)

        return target

@@ -225,43 +209,46 @@ def _remove_pseudo_str_values_from_str_column(data: InputData, columns: pd.Index
                    # item is numeric, remove its value
                    data.features[row_id, col_id] = np.nan

-    def _convert_feature_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series):
+    def _convert_feature_into_one_type(self, column_info: pd.Series, features: np.ndarray):
        """ Determine new type for current feature column based on the string ratio. And then convert column into it.

-        :param mixed_column: one-dimensional array with several data types
+        :param features: two-dimensional features table holding the column that is converted in place
        :param column_info: dictionary with information about types in the column
        :param mixed_column_id: index of column in dataset
        """
+        new_type_id = None
        if len(column_info[_TYPES]) == 2 and TYPE_TO_ID[type(None)] in column_info[_TYPES]:
            # Column contain only one data type and nans
-            filtered_types = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
-            return mixed_column, filtered_types[0]
-
-        string_objects_number = column_info[_STR_NUMBER]
-        all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
-        string_ratio = string_objects_number / all_elements_number
-
-        if string_ratio > 0:
-            suggested_type = str
+            non_nan_type_lst = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
+            new_type_id = non_nan_type_lst[0]
        else:
-            suggested_type = _obtain_new_column_type(column_info)
+            string_objects_number = column_info[_STR_NUMBER]
+            all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
+            string_ratio = string_objects_number / all_elements_number

-        try:
-            mixed_column = mixed_column.astype(suggested_type)
-            # If there were nans in the column - paste nan
-            if column_info[_NAN_NUMBER]:
-                mixed_column = mixed_column.astype(object)
-                mixed_column[column_info[_NAN_IDS]] = np.nan
-                del column_info[_NAN_IDS]
-            return mixed_column, TYPE_TO_ID[suggested_type]
-        except ValueError:
-            # Cannot convert string objects into int or float (for example 'a' into int)
-            prefix = f'Feature column with index {column_info.name} contains ' \
-                     f'following data types: {column_info[_TYPES]}.'
-            self.log.warning(f'{prefix} String cannot be converted into {suggested_type}.
Drop column.') - return None, None + if string_ratio > 0: + suggested_type = str + else: + suggested_type = _obtain_new_column_type(column_info) + + try: + converted = features[:, column_info.name].astype(suggested_type) + # If there were nans in the column - paste nan + if column_info[_NAN_NUMBER]: + converted = converted.astype(object) + converted[column_info[_NAN_IDS]] = np.nan + del column_info[_NAN_IDS] + features[:, column_info.name] = converted + except ValueError: + # Cannot convert string objects into int or float (for example 'a' into int) + prefix = (f'Feature column with index {column_info.name} contains ' + f'the following data types: {column_info[_TYPES]}.') + self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.') + else: + new_type_id = TYPE_TO_ID[suggested_type] + self.features_converted_columns[column_info.name] = new_type_id - def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: pd.Series, + def _convert_target_into_one_type(self, column_info: pd.Series, target: np.ndarray, task: Task) -> Tuple[np.ndarray, str]: """ Convert target columns into one type based on column proportions of object and task """ if task.task_type is TaskTypesEnum.classification: @@ -269,21 +256,21 @@ def _convert_target_into_one_type(self, mixed_column: np.ndarray, column_info: p suggested_type = str else: suggested_type = _obtain_new_column_type(column_info) + self.target_converted_columns[column_info.name] = TYPE_TO_ID[suggested_type] + mixed_column = target[:, column_info.name] try: - mixed_column = mixed_column.astype(suggested_type) - return mixed_column, TYPE_TO_ID[suggested_type] + target[:, column_info.name] = mixed_column.astype(suggested_type) except ValueError: # Cannot convert string objects into int or float (for example 'a' into int) - target_column = pd.Series(mixed_column) - converted_column = pd.to_numeric(target_column, errors='coerce') + converted_column = pd.to_numeric(mixed_column, errors='coerce') prefix = (f'Target column with index {column_info.name} contains ' - f'following data types: {column_info[_TYPES]}.') - log_message = f'{prefix} String cannot be converted into {suggested_type}. Ignore non converted values.' + f'the following data types: {column_info[_TYPES]}.') + log_message = f'{prefix} String cannot be converted into {suggested_type}. Set unconverted values to NaN.' 
        self.log.debug(log_message)
        self.target_converting_has_errors = True
-        return converted_column.values, TYPE_TO_ID[suggested_type]
+        target[:, column_info.name] = converted_column

    def _into_categorical_features_transformation_for_fit(self, data: InputData):
        """
@@ -295,6 +282,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
        numeric_type_ids = np.flatnonzero(is_numeric_type)
        num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids)
        nuniques = num_df.nunique(dropna=True)
+        # Reduce dataframe to include only categorical features
        num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)]
        cat_col_ids = num_df.columns
@@ -307,10 +295,6 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData):
        """ Apply conversion into categorical string column for every signed column """
-        if not self.numerical_into_str:
-            # There is no transformation for current table
-            return data
-
        # Get numerical columns
        num_df = pd.DataFrame(data.features[:, self.numerical_into_str],
                              columns=self.numerical_into_str)

@@ -325,10 +309,9 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
        """
        Automatically determine categorical features which should be converted into float
        """
-        str_columns = np.flatnonzero(
-            np.isin(data.supplementary_data.column_types['features'], TYPE_TO_ID[str])
-        )
-        str_cols_df = pd.DataFrame(data.features[:, str_columns], columns=str_columns)
+        is_str_type = data.supplementary_data.column_types['features'] == TYPE_TO_ID[str]
+        str_col_ids = np.flatnonzero(is_str_type)
+        str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids)
        orig_nans_cnt = str_cols_df.isna().sum(axis=0)

        converted_str_cols_df = str_cols_df.apply(pd.to_numeric, errors='coerce')
@@ -348,12 +331,12 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
        feature_types = data.supplementary_data.column_types['features']
        feature_types[is_numeric_ids] = TYPE_TO_ID[float]

-        # The columns consists mostly of truly str values and has a few ints/floats in it
+        # The columns consist mostly of truly str values and have a few ints/floats in it
        is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1)
        self._remove_pseudo_str_values_from_str_column(data, is_mixed[is_mixed].index)

        # If column contains a lot of '?'
or 'x' as nans equivalents - # add it remove list + # add it to remove list is_of_mistakes = ( (self.acceptable_failed_rate_bottom <= failed_ratio) & (failed_ratio < self.acceptable_failed_rate_top)) @@ -361,28 +344,27 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): def _into_numeric_features_transformation_for_predict(self, data: InputData): """ Apply conversion into float string column for every signed column """ - str_cols_ids = list(set(self.categorical_into_float) - .difference(self.string_columns_transformation_failed)) - str_cols_df = pd.DataFrame(data.features[:, str_cols_ids], columns=str_cols_ids) - data.features[:, str_cols_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() + str_col_ids = np.setdiff1d( + self.categorical_into_float, + list(self.string_columns_transformation_failed) + ).astype(int) + str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids) + data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) feature_types = data.supplementary_data.column_types['features'] - feature_types[str_cols_ids] = TYPE_TO_ID[float] + feature_types[str_col_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: """ Prepare information about types per columns. For each column store unique types, which column contains. """ - if table is None: - return pd.DataFrame() - table_of_types = pd.DataFrame(table, copy=True) table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8) # Build dataframe with unique types for each column - uniques = table_of_types.apply([pd.unique]).rename(index={'unique': _TYPES}) + uniques = table_of_types.apply(pd.unique, result_type='reduce').to_frame(_TYPES).T # Build dataframe with amount of each type counts_index_mapper = { @@ -394,14 +376,18 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: types_counts = ( table_of_types .apply(pd.value_counts, dropna=False) - .reindex(counts_index_mapper.keys(), copy=False) + .reindex(counts_index_mapper.keys(), copy=False) # Sets all type ids .replace(np.nan, 0) - .rename(index=counts_index_mapper, copy=False) + .rename(index=counts_index_mapper, copy=False) # Renames all type ids to strs .astype(int) ) # Build dataframe with nans indices - nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: _NAN_IDS}) + nans_ids = ( + (table_of_types == TYPE_TO_ID[type(None)]) + .apply(np.flatnonzero, result_type='reduce') + .to_frame(_NAN_IDS).T + ) # Combine all dataframes return pd.concat([uniques, types_counts, nans_ids]) @@ -409,13 +395,13 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: def _find_mixed_types_columns(columns_info: pd.DataFrame) -> pd.DataFrame: """ Search for columns with several types in them """ - has_mixed_types = [] if columns_info.empty else columns_info.loc[_TYPES].apply(len) > 1 + has_mixed_types = columns_info.loc[_TYPES].apply(len) > 1 return columns_info.loc[:, has_mixed_types] def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> pd.DataFrame: - _cols_have_any = [] if frame.empty else frame.loc[rows_to_select].any() - return frame.loc[:, _cols_have_any] + cols_have_any = frame.loc[rows_to_select].any() + return frame.loc[:, cols_have_any] def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: LoggerAdapter): @@ -424,27 +410,13 @@ 
def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: Lo
    transformation on predict stage when column types were already determined during fit
    """
+    table_df = pd.DataFrame(table, copy=False)
+    types_sr = pd.Series(column_types).map({
+        **{TYPE_TO_ID[t]: t for t in [int, str]},
+        **{TYPE_TO_ID[t]: float for t in [bool, type(None), float]}
+    })

-    def type_by_id(current_type_id: int):
-        """ Return type by its ID """
-        if current_type_id == TYPE_TO_ID[int]:
-            return int
-        elif current_type_id == TYPE_TO_ID[str]:
-            return str
-        return float
-
-    if table is None:
-        # Occurs if for predict stage there is no target info
-        return None
-
-    _, n_cols = table.shape
-    for column_id in range(n_cols):
-        current_column = table[:, column_id]
-        current_type = type_by_id(column_types[column_id])
-        _convert_predict_column_into_desired_type(table=table, current_column=current_column, current_type=current_type,
-                                                  column_id=column_id, log=log)
-
-    return table
+    return table_df.apply(_convert_predict_column_into_desired_type, types_sr=types_sr, log=log).to_numpy()

def convert_num_column_into_string_array(numerical_column: pd.Series) -> pd.Series:
@@ -464,20 +436,19 @@ def _obtain_new_column_type(column_info: pd.Series):
    return int

-def _convert_predict_column_into_desired_type(table: np.ndarray, current_column: np.ndarray,
-                                              column_id: int, current_type: type, log: LoggerAdapter):
+def _convert_predict_column_into_desired_type(current_column: pd.Series, types_sr: pd.Series, log: LoggerAdapter):
+    current_type = types_sr.loc[current_column.name]
    try:
-        table[:, column_id] = current_column.astype(current_type)
+        converted_column = current_column.astype(current_type)
        if current_type is str:
-            is_any_comma = any(',' in el for el in current_column)
-            is_any_dot = any('.' in el for el in current_column)
-            # Most likely case: '20,000' must be converted into '20.000'
-            if is_any_comma and is_any_dot:
-                warning = f'Column {column_id} contains both "." and ",". Standardize it.'
+            has_comma_and_dot = converted_column.str.contains(',', regex=False).any() and converted_column.str.contains('.', regex=False).any()
+            if has_comma_and_dot:
+                # Most likely case: '20,000' must be converted into '20.000'
+                warning = f'Column {current_column.name} contains both "." and ",". Standardize it.'
                log.warning(warning)
    except ValueError:
-        table[:, column_id] = _process_predict_column_values_one_by_one(current_column=current_column,
-                                                                        current_type=current_type)
+        converted_column = current_column.apply(_process_predict_column_values_one_by_one, current_type=current_type)
+    return converted_column

def _generate_list_with_types(columns_types_info: pd.DataFrame,
@@ -510,29 +481,19 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame,
    return np.array(updated_column_types)

-def _process_predict_column_values_one_by_one(current_column: np.ndarray, current_type: type):
+def _process_predict_column_values_one_by_one(value, current_type: type):
    """ Process column values one by one and try to convert them into desirable type.
    If not successful replace with np.nan """
-
-    def _process_str_numbers_with_dots_and_commas(value: str):
-        """ Try to process str with replacing ',' by '.'
in case it was meant to be a number """ - value = value.replace(',', '.') - new_value = np.nan - try: - # Since "10.6" can not be converted to 10 straightforward using int() - if current_type is int: - new_value = int(float(value)) - except ValueError: - pass - return new_value - - new_column = [] - for value in current_column: - new_value = np.nan - try: - new_value = current_type(value) - except ValueError: - if isinstance(value, str) and ('.' in value or ',' in value): - new_value = _process_str_numbers_with_dots_and_commas(value=value) - new_column.append(new_value) - return new_column + new_value = np.nan + try: + new_value = current_type(value) + except ValueError: + if isinstance(value, str) and ('.' in value or ',' in value): + value = value.replace(',', '.') + try: + # Since "10.6" can not be converted to 10 straightforward using int() + if current_type is int: + new_value = int(float(value)) + except ValueError: + pass + return new_value From d39e623a2c1db74310e4bd73a99c6d6939c749b9 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 10 Jul 2023 16:40:00 +0300 Subject: [PATCH 46/72] preprocessing simplifications and logical fixes --- fedot/core/data/data_preprocessing.py | 2 +- .../data_operations/categorical_encoders.py | 33 ++++++++----------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index dd710c0f14..26bc1ea462 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -63,7 +63,7 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda # Define if data contains string columns for "unknown table" return force_categorical_determination(table) - is_str = np.isin(column_type_ids, TYPE_TO_ID[str]) + is_str = column_type_ids == TYPE_TO_ID[str] categorical_ids = np.flatnonzero(is_str).tolist() non_categorical_ids = np.flatnonzero(~is_str).tolist() diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 55071ae5f2..34e08f5a45 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -1,8 +1,7 @@ from copy import deepcopy -from typing import Optional, List +from typing import List, Optional import numpy as np -import pandas as pd from sklearn.preprocessing import LabelEncoder, OneHotEncoder @@ -75,7 +74,7 @@ def _update_column_types(self, output_data: OutputData): if self.categorical_ids: # There are categorical features in the table feature_types = output_data.supplementary_data.column_types['features'] - numerical_columns = feature_types[np.isin(feature_types, TYPE_TO_ID[str], invert=True)] + numerical_columns = feature_types[feature_types != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) @@ -123,10 +122,8 @@ def fit(self, input_data: InputData): self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, feature_type_ids) - # If there are categorical features - process it - if self.categorical_ids: - # For every categorical feature - perform encoding - self._fit_label_encoders(input_data) + # For every existing categorical feature - perform encoding + 
self._fit_label_encoders(input_data.features) return self.encoders def transform(self, input_data: InputData) -> OutputData: @@ -134,9 +131,8 @@ def transform(self, input_data: InputData) -> OutputData: Applicable during predict stage """ copied_data = deepcopy(input_data) - if self.categorical_ids: - # If categorical features exists - transform them inplace in InputData - self._apply_label_encoder(copied_data.features) + # If categorical features exist - transform them inplace in InputData + self._apply_label_encoder(copied_data.features) output_data = self._convert_to_output(copied_data, copied_data.features) @@ -149,14 +145,13 @@ def _update_column_types(self, output_data: OutputData): feature_types = output_data.supplementary_data.column_types['features'] feature_types[self.categorical_ids] = TYPE_TO_ID[int] - def _fit_label_encoders(self, input_data: InputData): + def _fit_label_encoders(self, data: np.ndarray): """ Fit LabelEncoder for every categorical column in the dataset """ - for categorical_id in self.categorical_ids: - categorical_column = input_data.features[:, categorical_id] + categorical_columns = data[:, self.categorical_ids].astype(str) + for column_id, column in zip(self.categorical_ids, categorical_columns.T): le = LabelEncoder() - le.fit(categorical_column) - - self.encoders.update({categorical_id: le}) + le.fit(column) + self.encoders[column_id] = le def _apply_label_encoder(self, data: np.ndarray): """ @@ -165,13 +160,13 @@ def _apply_label_encoder(self, data: np.ndarray): Args: data: numpy array with all features """ - categorical_columns = data[:, self.categorical_ids] + categorical_columns = data[:, self.categorical_ids].astype(str) for column_id, column in zip(self.categorical_ids, categorical_columns.T): column_encoder = self.encoders[column_id] - column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, column))) + column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, column))) transformed_column = column_encoder.transform(column) - nan_idxs = np.flatnonzero(pd.isna(column)) + nan_idxs = np.flatnonzero(column == 'nan') if len(nan_idxs): # Store np.nan values transformed_column = transformed_column.astype(object) From 280822e15d946f1f36e0a025ff66e13baa32990e Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 10 Jul 2023 17:20:00 +0300 Subject: [PATCH 47/72] minor test lint fix --- .../data_operations/test_data_operations_implementations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index 9064781164..ad55628231 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -169,7 +169,7 @@ def get_mixed_data(task=None, extended=False): [8, '1', '1', 0, '1', 'not blue', 'da bu'], [9, '0', '0', 0, '0', 'not blue', 'dai']], dtype=object) feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], - TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) + TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) target_type_ids = np.array([TYPE_TO_ID[int]]) supp_data = SupplementaryData(column_types={'features': feature_type_ids, 'target': target_type_ids}) From e2e287afcae0b6593d3e5940cce4c99ab17247c8 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 10 Jul 2023 17:59:37 +0300 Subject: [PATCH 48/72] minor polishing --- 
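Note for this patch: the branch removals below lean on NumPy's handling of empty index lists, where selecting zero columns yields a zero-width array, so the non-categorical slice can be stacked unconditionally. A toy sketch of that hstack recombination using plain scikit-learn (not FEDOT's wrapper class; names are illustrative):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

features = np.array([[1.0, 'a'], [2.0, 'b'], [3.0, 'a']], dtype=object)
categorical_ids, non_categorical_ids = [1], [0]

encoder = OneHotEncoder()
encoded = encoder.fit_transform(features[:, categorical_ids].astype(str)).toarray()
numeric = features[:, non_categorical_ids]  # zero-width (n, 0) array when there are no numeric columns
transformed = np.hstack((numeric, encoded))
encoded_ids = np.array(range(numeric.shape[1], transformed.shape[1]))
print(transformed.shape, encoded_ids)  # (3, 3) [1 2]
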
.../data_operations/categorical_encoders.py | 31 +++++++------------ fedot/preprocessing/categorical.py | 8 ++--- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 34e08f5a45..dc6582a3e9 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -40,7 +40,7 @@ def fit(self, input_data: InputData): # If there are categorical features - process it if self.categorical_ids: - updated_cat_features = np.array(features[:, self.categorical_ids], dtype=str) + updated_cat_features = features[:, self.categorical_ids].astype(str) self.encoder.fit(updated_cat_features) return self.encoder @@ -55,13 +55,10 @@ def transform(self, input_data: InputData) -> OutputData: """ copied_data = deepcopy(input_data) - features = copied_data.features - if not self.categorical_ids: - # If there are no categorical features in the table - transformed_features = features - else: - # If categorical features are exists - transformed_features = self._apply_one_hot_encoding(features) + transformed_features = copied_data.features + if self.categorical_ids: + # If categorical features exist + transformed_features = self._apply_one_hot_encoding(transformed_features) # Update features output_data = self._convert_to_output(copied_data, @@ -90,19 +87,13 @@ def _apply_one_hot_encoding(self, features: np.ndarray) -> np.ndarray: :param features: tabular data for processing :return transformed_features: transformed features table """ + transformed_categorical = self.encoder.transform(features[:, self.categorical_ids]).toarray() - categorical_features = np.array(features[:, self.categorical_ids]) - transformed_categorical = self.encoder.transform(categorical_features).toarray() - - # If there are non-categorical features in the data - if not self.non_categorical_ids: - transformed_features = transformed_categorical - else: - # Stack transformed categorical and non-categorical data - non_categorical_features = np.array(features[:, self.non_categorical_ids]) - frames = (non_categorical_features, transformed_categorical) - transformed_features = np.hstack(frames) - self.encoded_ids = np.array(range(non_categorical_features.shape[1], transformed_features.shape[1])) + # Stack transformed categorical and non-categorical data, ignore if none + non_categorical_features = features[:, self.non_categorical_ids] + frames = (non_categorical_features, transformed_categorical) + transformed_features = np.hstack(frames) + self.encoded_ids = np.array(range(non_categorical_features.shape[1], transformed_features.shape[1])) return transformed_features diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 449dbbdf8c..bde52fed7e 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -32,7 +32,7 @@ def fit(self, input_data: InputData): for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T): pd_column = pd.Series(column, name=column_id, copy=True) is_nan = pd_column.isna() - column_nuniques = pd_column.nunique(False) + column_nuniques = pd_column.nunique(dropna=False) if is_nan.sum(): # This categorical column has nans pd_column[is_nan] = FEDOT_STR_NAN @@ -55,10 +55,6 @@ def 
transform(self, input_data: InputData) -> InputData: """ Apply transformation (converting str into integers) for selected (while training) features. """ - if len(self.binary_ids_to_convert) == 0: - # There are no binary categorical features - return input_data - copied_data = deepcopy(input_data) self._apply_encoder(copied_data.features) @@ -107,7 +103,7 @@ def _apply_encoder(self, data: np.ndarray): converted = encoder.transform(column) if len(nan_idxs): - # Column has nans in its structure - after conversion replace it + # Column has nans in its structure - replace them after conversion converted = converted.astype(float) converted[nan_idxs] = np.nan data[:, column_id] = converted From 752b4aca28813699c47ae95a6c4a2d31e041b8df Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 11 Jul 2023 17:47:58 +0300 Subject: [PATCH 49/72] applymap simplification data_types.py --- fedot/preprocessing/data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index da0dc272fe..7643c0cc3d 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -361,7 +361,7 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: types, which column contains. """ table_of_types = pd.DataFrame(table, copy=True) - table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8) + table_of_types = table_of_types.replace(np.nan, None).applymap(lambda el: TYPE_TO_ID[type(el)]) # Build dataframe with unique types for each column uniques = table_of_types.apply(pd.unique, result_type='reduce').to_frame(_TYPES).T From 0148e3530bdc16f222559a57931af1af16f755e9 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Tue, 11 Jul 2023 17:53:02 +0300 Subject: [PATCH 50/72] test_pipeline.py increase time constraint --- test/unit/pipelines/test_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/pipelines/test_pipeline.py b/test/unit/pipelines/test_pipeline.py index 2608e9338f..bfd656c9bc 100644 --- a/test/unit/pipelines/test_pipeline.py +++ b/test/unit/pipelines/test_pipeline.py @@ -389,7 +389,7 @@ def test_pipeline_fit_time_constraint(): test_pipeline_second = pipeline_first() predicted_second = test_pipeline_second.fit(input_data=train_data, - time_constraint=datetime.timedelta(seconds=1.6)) + time_constraint=datetime.timedelta(seconds=2.1)) computation_time_second = test_pipeline_second.computation_time assert comp_time_proc_with_first_constraint < comp_time_proc_with_second_constraint assert computation_time_first is None From 432d9eafd22ec73c227e0b5187c31e9ee0e833e5 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 12:59:36 +0300 Subject: [PATCH 51/72] rename all *types to *type_ids --- .../data/merge/supplementary_data_merger.py | 12 ++++---- .../data_operations/categorical_encoders.py | 8 ++--- .../data_operations/sklearn_selectors.py | 4 +-- .../sklearn_transformations.py | 4 +-- .../data_operations/ts_transformations.py | 8 ++--- fedot/preprocessing/categorical.py | 4 +-- fedot/preprocessing/data_types.py | 30 +++++++++---------- test/unit/preprocessing/test_preprocessors.py | 16 +++++----- 8 files changed, 42 insertions(+), 44 deletions(-) diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 866f450cc2..ea73a7464d 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ 
b/fedot/core/data/merge/supplementary_data_merger.py @@ -84,8 +84,8 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: # Concatenate types for features columns and # choose target type of the main target as the new target type - new_feature_types = [] - new_target_types = None + new_feature_type_ids = [] + new_target_type_ids = None for output in self.outputs: if output.supplementary_data.column_types is None: self.log.debug('Perform determination of column types in DataMerger') @@ -93,10 +93,10 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: output.supplementary_data.column_types = table_corr.prepare_column_types_info(output.predict, output.target, output.task) - feature_types = output.supplementary_data.column_types['features'] - new_feature_types.extend(feature_types) + feature_type_ids = output.supplementary_data.column_types['features'] + new_feature_type_ids.extend(feature_type_ids) if output.supplementary_data.is_main_target: # Target can be None for predict stage - new_target_types = output.supplementary_data.column_types.get('target') - return {'features': np.array(new_feature_types), 'target': new_target_types} + new_target_type_ids = output.supplementary_data.column_types.get('target') + return {'features': np.array(new_feature_type_ids), 'target': new_target_type_ids} diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index dc6582a3e9..7c5323d891 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -70,8 +70,8 @@ def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. Categorical columns becomes integer with extension """ if self.categorical_ids: # There are categorical features in the table - feature_types = output_data.supplementary_data.column_types['features'] - numerical_columns = feature_types[feature_types != TYPE_TO_ID[str]] + feature_type_ids = output_data.supplementary_data.column_types['features'] + numerical_columns = feature_type_ids[feature_type_ids != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns) @@ -133,8 +133,8 @@ def transform(self, input_data: InputData) -> OutputData: def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. 
Categorical becomes integer """ - feature_types = output_data.supplementary_data.column_types['features'] - feature_types[self.categorical_ids] = TYPE_TO_ID[int] + feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids[self.categorical_ids] = TYPE_TO_ID[int] def _fit_label_encoders(self, data: np.ndarray): """ Fit LabelEncoder for every categorical column in the dataset """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index 23c56329e1..1248865b85 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -81,10 +81,10 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] if cols_number_removed: # There are several columns, which were dropped - feature_types = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.column_types['features'] # Calculate - output_data.supplementary_data.column_types['features'] = feature_types[self.remain_features_mask] + output_data.supplementary_data.column_types['features'] = feature_type_ids[self.remain_features_mask] def _make_new_table(self, features): """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 05d824e320..67a3c9bbe1 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -196,9 +196,9 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_added = output_data.predict.shape[1] - source_features_shape[1] if cols_number_added > 0: # There are new columns in the table - feature_types = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.column_types['features'] new_types = [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = np.append(feature_types, new_types) + output_data.supplementary_data.column_types['features'] = np.append(feature_type_ids, new_types) class ScalingImplementation(EncodedInvariantImplementation): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 67ce85ff3d..6881b50e9c 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -128,13 +128,13 @@ def _update_column_types(self, output_data: OutputData): """ _, features_n_cols = output_data.predict.shape - feature_types = np.array([TYPE_TO_ID[float]] * features_n_cols) - column_types = {'features': feature_types} + feature_type_ids = np.array([TYPE_TO_ID[float]] * features_n_cols) + column_types = {'features': feature_type_ids} if output_data.target is not None and 
len(output_data.target.shape) > 1: _, target_n_cols = output_data.target.shape - target_types = np.array([TYPE_TO_ID[float]] * target_n_cols) - column_types['target'] = target_types + target_type_ids = np.array([TYPE_TO_ID[float]] * target_n_cols) + column_types['target'] = target_type_ids output_data.supplementary_data.column_types = column_types def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index bde52fed7e..fa5da3583e 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -59,8 +59,8 @@ def transform(self, input_data: InputData) -> InputData: self._apply_encoder(copied_data.features) # Update features types - feature_types = copied_data.supplementary_data.column_types['features'] - feature_types[self.binary_ids_to_convert] = TYPE_TO_ID[int] + feature_type_ids = copied_data.supplementary_data.column_types['features'] + feature_type_ids[self.binary_ids_to_convert] = TYPE_TO_ID[int] return copied_data def fit_transform(self, input_data: InputData) -> InputData: diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 7643c0cc3d..f046306805 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -164,15 +164,15 @@ def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = self.target_columns_info = define_column_types(target) target = self.target_types_converting(target=target, task=task) - feature_types = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) - self._check_columns_vs_types_number(predictors, feature_types) + feature_type_ids = _generate_list_with_types(self.features_columns_info, self.features_converted_columns) + self._check_columns_vs_types_number(predictors, feature_type_ids) if target is None or task.task_type is TaskTypesEnum.ts_forecasting: - return {'features': feature_types} + return {'features': feature_type_ids} else: - target_types = _generate_list_with_types(self.target_columns_info, self.target_converted_columns) - self._check_columns_vs_types_number(target, target_types) - return {'features': feature_types, 'target': target_types} + target_type_ids = _generate_list_with_types(self.target_columns_info, self.target_converted_columns) + self._check_columns_vs_types_number(target, target_type_ids) + return {'features': feature_type_ids, 'target': target_type_ids} def _retain_columns_info_without_types_conflicts(self, data: InputData): """ Update information in supplementary info - retain info only about remained columns. @@ -277,8 +277,8 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. 
If feature column contains int or float values with few unique values (less than 13) """ - feature_types = data.supplementary_data.column_types['features'] - is_numeric_type = np.isin(feature_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) + feature_type_ids = data.supplementary_data.column_types['features'] + is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) nuniques = num_df.nunique(dropna=True) @@ -291,7 +291,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): # Columns need to be transformed into categorical (string) ones self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) # Update information about column types (in-place) - feature_types[cat_col_ids] = TYPE_TO_ID[str] + feature_type_ids[cat_col_ids] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -302,8 +302,8 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Update information about column types (in-place) - feature_types = data.supplementary_data.column_types['features'] - feature_types[self.numerical_into_str] = TYPE_TO_ID[str] + feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ @@ -328,8 +328,8 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) # Update information about column types (in-place) - feature_types = data.supplementary_data.column_types['features'] - feature_types[is_numeric_ids] = TYPE_TO_ID[float] + feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids[is_numeric_ids] = TYPE_TO_ID[float] # The columns consist mostly of truly str values and has a few ints/floats in it is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1) @@ -352,8 +352,8 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) - feature_types = data.supplementary_data.column_types['features'] - feature_types[str_col_ids] = TYPE_TO_ID[float] + feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids[str_col_ids] = TYPE_TO_ID[float] def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index afb64661a3..cd2a95f3da 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -129,15 +129,13 @@ def test_column_types_converting_correctly(): types_corr = TableTypesCorrector() data = types_corr.convert_data_for_fit(data) - feature_types = data.supplementary_data.column_types['features'] - target_types = data.supplementary_data.column_types['target'] - - assert len(feature_types) == 4 - assert len(target_types) == 2 - assert feature_types[0] == TYPE_TO_ID[str] - assert 
feature_types[1] == TYPE_TO_ID[str] - assert feature_types[2] == TYPE_TO_ID[str] - assert (target_types == TYPE_TO_ID[str]).all() + feature_type_ids = data.supplementary_data.column_types['features'] + target_type_ids = data.supplementary_data.column_types['target'] + + assert len(feature_type_ids) == 4 + assert len(target_type_ids) == 2 + assert (feature_type_ids[[0, 1, 2]] == TYPE_TO_ID[str]).all() + assert (target_type_ids == TYPE_TO_ID[str]).all() def test_column_types_process_correctly(): From 3c338d98ba0f8f225daebe13dcc68c171241d0d4 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 13:26:34 +0300 Subject: [PATCH 52/72] rename column_types to col_type_ids --- fedot/core/data/data_preprocessing.py | 2 +- .../data/merge/supplementary_data_merger.py | 12 ++--- fedot/core/data/supplementary_data.py | 4 +- .../data_operations/categorical_encoders.py | 10 ++--- .../data_operations/sklearn_selectors.py | 4 +- .../sklearn_transformations.py | 8 ++-- .../data_operations/ts_transformations.py | 6 +-- fedot/core/operations/model.py | 20 ++++----- fedot/preprocessing/categorical.py | 4 +- fedot/preprocessing/data_types.py | 45 ++++++++++--------- fedot/preprocessing/preprocessing.py | 4 +- test/unit/data/test_supplementary_data.py | 10 ++--- .../test_data_operations_implementations.py | 14 +++--- .../test_preprocessing_through_api.py | 2 +- test/unit/preprocessing/test_preprocessors.py | 8 ++-- 15 files changed, 77 insertions(+), 76 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 26bc1ea462..b5519034c7 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -98,7 +98,7 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: return False - column_type_ids = data.supplementary_data.column_types.get('features') + column_type_ids = data.supplementary_data.col_type_ids['features'] cat_ids, non_cat_ids = find_categorical_columns(data.features, column_type_ids) data_has_categorical_columns = len(cat_ids) > 0 diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index ea73a7464d..6a4c747e4a 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -24,7 +24,7 @@ def merge(self) -> SupplementaryData: obligatorily_preprocessed=self.all_preprocessed(), optionally_preprocessed=self.all_preprocessed(is_obligatory=False), non_int_idx=None, # is set elsewhere (by preprocessor or during pipeline fit/predict) - column_types=self.merge_column_types() + col_type_ids=self.merge_column_types() ) def calculate_dataflow_len(self) -> int: @@ -80,23 +80,23 @@ def merge_column_types(self) -> Dict[str, np.ndarray]: """ Store information about column types in tabular data for merged data """ if self.main_output.data_type is not DataTypesEnum.table: # Data is not tabular - return self.main_output.supplementary_data.column_types + return self.main_output.supplementary_data.col_type_ids # Concatenate types for features columns and # choose target type of the main target as the new target type new_feature_type_ids = [] new_target_type_ids = None for output in self.outputs: - if output.supplementary_data.column_types is None: + if output.supplementary_data.col_type_ids is None: self.log.debug('Perform determination of column types in DataMerger') table_corr = TableTypesCorrector() - output.supplementary_data.column_types = 
table_corr.prepare_column_types_info(output.predict,
+                output.supplementary_data.col_type_ids = table_corr.prepare_column_types_info(output.predict,
                                                                                               output.target,
                                                                                               output.task)
-            feature_type_ids = output.supplementary_data.column_types['features']
+            feature_type_ids = output.supplementary_data.col_type_ids['features']
             new_feature_type_ids.extend(feature_type_ids)

             if output.supplementary_data.is_main_target:
                 # Target can be None for predict stage
-                new_target_type_ids = output.supplementary_data.column_types.get('target')
+                new_target_type_ids = output.supplementary_data.col_type_ids.get('target')

         return {'features': np.array(new_feature_type_ids), 'target': new_target_type_ids}

diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py
index bd9f8d8881..8a053be9a9 100644
--- a/fedot/core/data/supplementary_data.py
+++ b/fedot/core/data/supplementary_data.py
@@ -27,8 +27,8 @@ class SupplementaryData:
     optionally_preprocessed: bool = False
     # Collection with non-int indexes
     non_int_idx: Optional[list] = None
-    # Dictionary with features and target column types
-    column_types: Optional[Dict[str, np.ndarray]] = None
+    # Dictionary with features and target column type numeric identifiers
+    col_type_ids: Optional[Dict[str, np.ndarray]] = None

     @property
     def compound_mask(self):
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 7c5323d891..5d2993417b 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -35,7 +35,7 @@ def fit(self, input_data: InputData):
         :return encoder: trained encoder (optional output)
         """
         features = input_data.features
-        feature_type_ids = input_data.supplementary_data.column_types['features']
+        feature_type_ids = input_data.supplementary_data.col_type_ids['features']
         self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, feature_type_ids)

         # If there are categorical features - process it
@@ -70,8 +70,8 @@ def _update_column_types(self, output_data: OutputData):
         """ Update column types after encoding.
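        (the numerical columns keep their type ids, while the encoded categorical ones are replaced by appended binary int columns)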
Categorical columns becomes integer with extension """ if self.categorical_ids: # There are categorical features in the table - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] numerical_columns = feature_type_ids[feature_type_ids != TYPE_TO_ID[str]] # Calculate new binary columns number after encoding @@ -78,7 +78,7 @@ def _update_column_types(self, output_data: OutputData): numerical_columns = np.append(numerical_columns, [TYPE_TO_ID[int]] * encoded_columns_number) output_data.encoded_idx = self.encoded_ids - output_data.supplementary_data.column_types['features'] = numerical_columns + output_data.supplementary_data.col_type_ids['features'] = numerical_columns def _apply_one_hot_encoding(self, features: np.ndarray) -> np.ndarray: """ @@ -109,7 +109,7 @@ def __init__(self, params: Optional[OperationParameters] = None): self.non_categorical_ids: List[int] = None def fit(self, input_data: InputData): - feature_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.col_type_ids['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, feature_type_ids) @@ -133,7 +133,7 @@ def transform(self, input_data: InputData) -> OutputData: def _update_column_types(self, output_data: OutputData): """ Update column types after encoding. Categorical becomes integer """ - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] feature_type_ids[self.categorical_ids] = TYPE_TO_ID[int] def _fit_label_encoders(self, data: np.ndarray): diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index 1248865b85..8444b4eaf0 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -81,10 +81,10 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_removed = source_features_shape[1] - output_data.predict.shape[1] if cols_number_removed: # There are several columns, which were dropped - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] # Calculate - output_data.supplementary_data.column_types['features'] = feature_type_ids[self.remain_features_mask] + output_data.supplementary_data.col_type_ids['features'] = feature_type_ids[self.remain_features_mask] def _make_new_table(self, features): """ diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 67a3c9bbe1..df87b7b1b5 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -89,7 +89,7 @@ def update_column_types(output_data: OutputData) -> OutputData: """ _, n_cols = output_data.predict.shape - output_data.supplementary_data.column_types['features'] = np.array([TYPE_TO_ID[float]] * n_cols) 
+ output_data.supplementary_data.col_type_ids['features'] = np.array([TYPE_TO_ID[float]] * n_cols) return output_data @@ -196,9 +196,9 @@ def _update_column_types(self, source_features_shape, output_data: OutputData): cols_number_added = output_data.predict.shape[1] - source_features_shape[1] if cols_number_added > 0: # There are new columns in the table - feature_type_ids = output_data.supplementary_data.column_types['features'] + feature_type_ids = output_data.supplementary_data.col_type_ids['features'] new_types = [TYPE_TO_ID[float]] * cols_number_added - output_data.supplementary_data.column_types['features'] = np.append(feature_type_ids, new_types) + output_data.supplementary_data.col_type_ids['features'] = np.append(feature_type_ids, new_types) class ScalingImplementation(EncodedInvariantImplementation): @@ -290,7 +290,7 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) if data_type_is_table(input_data) and data_has_categorical_features(input_data): - feature_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.col_type_ids['features'] self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, feature_type_ids) numerical, categorical = divide_data_categorical_numerical(input_data, self.categorical_ids, diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py index 6881b50e9c..985762ab29 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/ts_transformations.py @@ -129,13 +129,13 @@ def _update_column_types(self, output_data: OutputData): _, features_n_cols = output_data.predict.shape feature_type_ids = np.array([TYPE_TO_ID[float]] * features_n_cols) - column_types = {'features': feature_type_ids} + col_type_ids = {'features': feature_type_ids} if output_data.target is not None and len(output_data.target.shape) > 1: _, target_n_cols = output_data.target.shape target_type_ids = np.array([TYPE_TO_ID[float]] * target_n_cols) - column_types['target'] = target_type_ids - output_data.supplementary_data.column_types = column_types + col_type_ids['target'] = target_type_ids + output_data.supplementary_data.col_type_ids = col_type_ids def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array, forecast_length: int, old_idx: np.array): diff --git a/fedot/core/operations/model.py b/fedot/core/operations/model.py index 250016fc74..99caabc25b 100644 --- a/fedot/core/operations/model.py +++ b/fedot/core/operations/model.py @@ -35,39 +35,39 @@ def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> Ou # Add information about features if is_regression_task or is_ts_forecasting_task: if len(predict_shape) < 2: - column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} + col_type_ids = {'features': [TYPE_TO_ID[float]] * predict_shape[0]} else: - column_types = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} + col_type_ids = {'features': [TYPE_TO_ID[float]] * predict_shape[1]} else: if len(predict_shape) < 2: output_data.predict = output_data.predict.reshape((-1, 1)) predict_shape = output_data.predict.shape # Classification task or clustering target_type = int if output_mode == 'labels' else float 
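            # ('labels' output carries integer class indices; probability-like outputs stay float)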
- column_types = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} + col_type_ids = {'features': [TYPE_TO_ID[target_type]] * predict_shape[1]} # Make feature types static to suit supplementary data contract - column_types['features'] = np.array(column_types['features']) + col_type_ids['features'] = np.array(col_type_ids['features']) # Add information about target target_shape = output_data.target.shape if output_data.target is not None else None if target_shape is None: # There is no target column in output data - output_data.supplementary_data.column_types = column_types + output_data.supplementary_data.col_type_ids = col_type_ids return output_data if is_regression_task or is_ts_forecasting_task: if len(target_shape) > 1: - column_types['target'] = [TYPE_TO_ID[float]] * target_shape[1] + col_type_ids['target'] = [TYPE_TO_ID[float]] * target_shape[1] else: # Array present "time series" - column_types['target'] = [TYPE_TO_ID[float]] * len(output_data.target) + col_type_ids['target'] = [TYPE_TO_ID[float]] * len(output_data.target) else: # Classification task or clustering - column_types['target'] = [TYPE_TO_ID[int]] * predict_shape[1] + col_type_ids['target'] = [TYPE_TO_ID[int]] * predict_shape[1] # Make target types static to suit supplementary data contract - column_types['target'] = np.array(column_types['target']) + col_type_ids['target'] = np.array(col_type_ids['target']) - output_data.supplementary_data.column_types = column_types + output_data.supplementary_data.col_type_ids = col_type_ids return output_data diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index fa5da3583e..c0ea6913eb 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -24,7 +24,7 @@ def fit(self, input_data: InputData): Find indices of columns which are contains categorical values. Binary features and at the same time has str objects. 
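        (i.e. categorical columns with exactly two unique values stored as strings)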
If there are such features - convert it into int """ - feature_type_ids = input_data.supplementary_data.column_types['features'] + feature_type_ids = input_data.supplementary_data.col_type_ids['features'] categorical_ids, _ = find_categorical_columns(input_data.features, feature_type_ids) @@ -59,7 +59,7 @@ def transform(self, input_data: InputData) -> InputData: self._apply_encoder(copied_data.features) # Update features types - feature_type_ids = copied_data.supplementary_data.column_types['features'] + feature_type_ids = copied_data.supplementary_data.col_type_ids['features'] feature_type_ids[self.binary_ids_to_convert] = TYPE_TO_ID[int] return copied_data diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index f046306805..19732e2196 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Tuple, Optional, List, Dict, Sequence +from collections.abc import Sequence +from typing import TYPE_CHECKING, Tuple, Optional, List, Dict import numpy as np import pandas as pd @@ -84,7 +85,7 @@ def convert_data_for_fit(self, data: InputData): # And in target(s) data.target = self.target_types_converting(target=data.target, task=data.task) - data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, + data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) @@ -92,8 +93,8 @@ def convert_data_for_fit(self, data: InputData): # Launch conversion float and integer features into categorical self._into_categorical_features_transformation_for_fit(data) # Save info about features and target types - self.feature_type_ids = data.supplementary_data.column_types['features'].copy() - self.target_type_ids = data.supplementary_data.column_types.get( + self.feature_type_ids = data.supplementary_data.col_type_ids['features'].copy() + self.target_type_ids = data.supplementary_data.col_type_ids.get( 'target', np.empty((self.feature_type_ids.shape[0], 1), dtype=float) ).copy() @@ -107,7 +108,7 @@ def convert_data_for_predict(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.features_converted_columns) data.features = apply_type_transformation(data.features, self.feature_type_ids, self.log) data.target = apply_type_transformation(data.target, self.target_type_ids, self.log) - data.supplementary_data.column_types = self.prepare_column_types_info(predictors=data.features, + data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) @@ -184,15 +185,15 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) - data.supplementary_data.column_types['features'] = np.delete( - data.supplementary_data.column_types['features'], + data.supplementary_data.col_type_ids['features'] = np.delete( + data.supplementary_data.col_type_ids['features'], list(self.string_columns_transformation_failed) ) - def _check_columns_vs_types_number(self, table: np.ndarray, column_types: list): + def _check_columns_vs_types_number(self, table: np.ndarray, col_type_ids: Sequence): # Check if columns number correct _, n_cols = table.shape - if n_cols != len(column_types): + if n_cols != len(col_type_ids): # There is an incorrect types calculation 
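            # (type ids fell out of sync with the table, e.g. a conversion dropped or added columns
            # without updating the supplementary data)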
self.log.warning('Columns number and types numbers do not match.') @@ -277,7 +278,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): Perform automated categorical features determination. If feature column contains int or float values with few unique values (less than 13) """ - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]) numeric_type_ids = np.flatnonzero(is_numeric_type) num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) @@ -302,14 +303,14 @@ def _into_categorical_features_transformation_for_predict(self, data: InputData) data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy() # Update information about column types (in-place) - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] feature_type_ids[self.numerical_into_str] = TYPE_TO_ID[str] def _into_numeric_features_transformation_for_fit(self, data: InputData): """ Automatically determine categorical features which should be converted into float """ - is_str_type = data.supplementary_data.column_types['features'] == TYPE_TO_ID[str] + is_str_type = data.supplementary_data.col_type_ids['features'] == TYPE_TO_ID[str] str_col_ids = np.flatnonzero(is_str_type) str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids) orig_nans_cnt = str_cols_df.isna().sum(axis=0) @@ -328,7 +329,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData): self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float)) # Update information about column types (in-place) - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] feature_type_ids[is_numeric_ids] = TYPE_TO_ID[float] # The columns consist mostly of truly str values and has a few ints/floats in it @@ -352,7 +353,7 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData): data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy() # Update information about column types (in-place) - feature_type_ids = data.supplementary_data.column_types['features'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] feature_type_ids[str_col_ids] = TYPE_TO_ID[float] @@ -404,14 +405,14 @@ def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> return frame.loc[:, cols_have_any] -def apply_type_transformation(table: np.ndarray, column_types: Sequence, log: LoggerAdapter): +def apply_type_transformation(table: np.ndarray, col_type_ids: Sequence, log: LoggerAdapter): """ Apply transformation for columns in dataset into desired type. 
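        (int and str ids map back to their own types, while bool, NoneType and float ids are all cast to float,
        as the mapping below shows; e.g. a column of number-like strings with col_type_ids=[TYPE_TO_ID[int]]
        is returned as an int column, cf. test_str_numbers_with_dots_and_commas_in_predict)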
Perform transformation on predict stage when column types were already determined during fit """ table_df = pd.DataFrame(table, copy=False) - types_sr = pd.Series(column_types).map({ + types_sr = pd.Series(col_type_ids).map({ **{TYPE_TO_ID[t]: t for t in [int, str]}, **{TYPE_TO_ID[t]: float for t in [bool, type(None), float]} }) @@ -458,27 +459,27 @@ def _generate_list_with_types(columns_types_info: pd.DataFrame, :param columns_types_info: dictionary with initial column types :param converted_columns: dictionary with transformed column types """ - updated_column_types = [] + updated_col_type_ids = [] for column_id, column_type_ids in columns_types_info.loc[_TYPES].items(): if len(column_type_ids) == 1: # Column initially contain only one type - updated_column_types.append(column_type_ids[0]) + updated_col_type_ids.append(column_type_ids[0]) elif len(column_type_ids) == 2 and TYPE_TO_ID[type(None)] in column_type_ids: # Column with one type and nans filtered_types = [x for x in column_type_ids if x != TYPE_TO_ID[type(None)]] - updated_column_types.append(filtered_types[0]) + updated_col_type_ids.append(filtered_types[0]) else: if TYPE_TO_ID[str] in column_type_ids: # Mixed-types column with string new_col_id = converted_columns[column_id] if new_col_id is not None: - updated_column_types.append(new_col_id) + updated_col_type_ids.append(new_col_id) else: # Mixed-types with float and integer - updated_column_types.append(TYPE_TO_ID[float]) + updated_col_type_ids.append(TYPE_TO_ID[float]) - return np.array(updated_column_types) + return np.array(updated_col_type_ids) def _process_predict_column_values_one_by_one(value, current_type: type): diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 3ad213c912..6292cc786c 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -402,7 +402,7 @@ def _train_target_encoder(self, data: InputData, source_name: str): data: data to be encoded source_name: name of the data source node """ - categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.column_types.get('target')) + categorical_ids, _ = find_categorical_columns(data.target, data.supplementary_data.col_type_ids.get('target')) if categorical_ids: # Target is categorical @@ -428,7 +428,7 @@ def _apply_target_encoding(self, data: InputData, source_name: str) -> np.ndarra encoded_target = data.target if encoder is not None: # Target encoders have already been fitted - data.supplementary_data.column_types['target'] = np.array([TYPE_TO_ID[int]]) + data.supplementary_data.col_type_ids['target'] = np.array([TYPE_TO_ID[int]]) encoded_target = encoder.transform(encoded_target) if len(encoded_target.shape) == 1: encoded_target = encoded_target.reshape((-1, 1)) diff --git a/test/unit/data/test_supplementary_data.py b/test/unit/data/test_supplementary_data.py index 7c768392b5..0a4f9beaa1 100644 --- a/test/unit/data/test_supplementary_data.py +++ b/test/unit/data/test_supplementary_data.py @@ -20,14 +20,14 @@ def outputs_table_with_different_types(): task = Task(TaskTypesEnum.regression) idx = [0, 1, 2] target = [1, 2, 10] - data_info_first = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str], TYPE_TO_ID[float]]), + data_info_first = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[str], TYPE_TO_ID[float]]), 'target': np.array([TYPE_TO_ID[int]])}) output_first = OutputData(idx=idx, features=None, predict=np.array([['a', 1.1], ['b', 2], ['c', 3]], dtype=object), task=task, 
target=target, data_type=DataTypesEnum.table, supplementary_data=data_info_first) - data_info_second = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]]), + data_info_second = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[float]]), 'target': np.array([TYPE_TO_ID[int]])}) output_second = OutputData(idx=idx, features=None, predict=np.array([[2.5], [2.1], [9.3]], dtype=float), @@ -118,11 +118,11 @@ def test_define_types_after_merging(outputs_table_with_different_types): merged_data = DataMerger.get(outputs).merge() updated_info = merged_data.supplementary_data - feature_type_ids = updated_info.column_types['features'] - target_type_ids = updated_info.column_types['target'] + feature_type_ids = updated_info.col_type_ids['features'] + target_type_ids = updated_info.col_type_ids['target'] # Target type must stay the same - ancestor_target_type = outputs[0].supplementary_data.column_types['target'][0] + ancestor_target_type = outputs[0].supplementary_data.col_type_ids['target'][0] assert target_type_ids[0] == ancestor_target_type assert len(feature_type_ids) == 3 assert tuple(feature_type_ids) == (TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[float]) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index ad55628231..b5832b1bc1 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -129,7 +129,7 @@ def get_multivariate_time_series(mutli_ts=False): def get_nan_inf_data(): - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[float]] * 4)}) + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[float]] * 4)}) train_input = InputData(idx=[0, 1, 2, 3], features=np.array([[1, 2, 3, 4], [2, np.nan, 4, 5], @@ -144,7 +144,7 @@ def get_nan_inf_data(): def get_single_feature_data(task=None): - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int]]), + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[int]]), 'target': np.array([TYPE_TO_ID[int]])}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=np.array([[1], [2], [3], [7], [8], [9]]), @@ -171,7 +171,7 @@ def get_mixed_data(task=None, extended=False): feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[str], TYPE_TO_ID[str]]) target_type_ids = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': feature_type_ids, + supp_data = SupplementaryData(col_type_ids={'features': feature_type_ids, 'target': target_type_ids}) else: features = np.array([[1, '0', 1], @@ -182,7 +182,7 @@ def get_mixed_data(task=None, extended=False): [9, '0', 0]], dtype=object) feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) target_type_ids = np.array([TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': feature_type_ids, + supp_data = SupplementaryData(col_type_ids={'features': feature_type_ids, 'target': target_type_ids}) train_input = InputData(idx=[0, 1, 2, 3, 4, 5], @@ -203,7 +203,7 @@ def get_nan_binary_data(task=None): For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ feature_type_ids = np.array([TYPE_TO_ID[int], TYPE_TO_ID[str], TYPE_TO_ID[int]]) - supp_data = SupplementaryData(column_types={'features': feature_type_ids}) + supp_data = 
SupplementaryData(col_type_ids={'features': feature_type_ids}) features = np.array([[1, '0', 0], [np.nan, np.nan, np.nan], [0, '2', 1], @@ -232,7 +232,7 @@ def get_unbalanced_dataset(size=10, disbalance=0.4, target_dim=None): if target_dim == 2: target = target.reshape(-1, 1) - supp_data = SupplementaryData(column_types={ + supp_data = SupplementaryData(col_type_ids={ 'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[str]]), 'target': np.array([TYPE_TO_ID[int]]) }) @@ -253,7 +253,7 @@ def data_with_binary_int_features_and_equal_categories(): must be processed as "almost categorical". Current dataset For example, nan object in [1, nan, 0, 0] must be filled as 0, not as 0.33 """ - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[int]])}) + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[int], TYPE_TO_ID[int]])}) task = Task(TaskTypesEnum.classification) features = np.array([[1, 10], [np.nan, np.nan], diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py index 5726b041df..a4d0e83fd8 100644 --- a/test/unit/preprocessing/test_preprocessing_through_api.py +++ b/test/unit/preprocessing/test_preprocessing_through_api.py @@ -11,7 +11,7 @@ def data_with_only_categorical_features(): """ Generate tabular data with only categorical features. All of them are binary. """ - supp_data = SupplementaryData(column_types={'features': np.array([TYPE_TO_ID[str]] * 3)}) + supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[str]] * 3)}) task = Task(TaskTypesEnum.regression) features = np.array([["'a'", "0", "1"], ["'b'", "1", "0"], diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index cd2a95f3da..9cc3027ebb 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -129,8 +129,8 @@ def test_column_types_converting_correctly(): types_corr = TableTypesCorrector() data = types_corr.convert_data_for_fit(data) - feature_type_ids = data.supplementary_data.column_types['features'] - target_type_ids = data.supplementary_data.column_types['target'] + feature_type_ids = data.supplementary_data.col_type_ids['features'] + target_type_ids = data.supplementary_data.col_type_ids['target'] assert len(feature_type_ids) == 4 assert len(target_type_ids) == 2 @@ -154,7 +154,7 @@ def test_column_types_process_correctly(): pipeline.fit(train_data) predicted = pipeline.predict(test_data) - feature_type_ids = predicted.supplementary_data.column_types['features'] + feature_type_ids = predicted.supplementary_data.col_type_ids['features'] assert len(feature_type_ids) == predicted.predict.shape[1] # All output values are float assert (feature_type_ids == TYPE_TO_ID[float]).all() @@ -262,7 +262,7 @@ def test_str_numbers_with_dots_and_commas_in_predict(): input_data = InputData(idx=np.arange(4), features=features, target=target, task=task, data_type=DataTypesEnum.table) - transformed_predict = apply_type_transformation(table=input_data.features, column_types=[TYPE_TO_ID[int]], + transformed_predict = apply_type_transformation(table=input_data.features, col_type_ids=[TYPE_TO_ID[int]], log=default_log('test_str_numbers_with_dots_and_commas_in_predict')) assert all(transformed_predict == np.array([[8], [4], [3], [6]])) From 3fddcc8b5293154f7f4a309d840018b29fe771f8 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 13:59:38 +0300 Subject: [PATCH 
53/72] pandas version fix --- fedot/preprocessing/data_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 19732e2196..96278bf0b3 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -362,7 +362,7 @@ def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame: types, which column contains. """ table_of_types = pd.DataFrame(table, copy=True) - table_of_types = table_of_types.replace(np.nan, None).applymap(lambda el: TYPE_TO_ID[type(el)]) + table_of_types = table_of_types.replace({np.nan: None}).applymap(lambda el: TYPE_TO_ID[type(el)]) # Build dataframe with unique types for each column uniques = table_of_types.apply(pd.unique, result_type='reduce').to_frame(_TYPES).T From 315ab99d961aea9ca60a57492eaa8a4750858b02 Mon Sep 17 00:00:00 2001 From: Pakulin Sergei Date: Mon, 31 Jul 2023 14:00:29 +0300 Subject: [PATCH 54/72] inf condition simplification --- fedot/core/data/data_preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index b5519034c7..c32adf36ff 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -14,7 +14,7 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool: def replace_inf_with_nans(input_data: InputData): features = input_data.features - is_inf = (features == np.inf) | (features == -np.inf) + is_inf = np.isin(features, [np.inf, -np.inf]) if np.any(is_inf): features[is_inf] = np.nan From 5240a6c709f2e83967fb3cc6690d093a188fba2c Mon Sep 17 00:00:00 2001 From: Sergei Pakulin Date: Tue, 29 Aug 2023 19:18:53 +0500 Subject: [PATCH 55/72] upd gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 961f017444..cdacb38f3e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .idea/ +.vscode/ **/.pytest_cache/ **/__pycache__/ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm @@ -76,4 +77,4 @@ dist/ test/unit/test_log.log test/unit/catboost_info -local/ \ No newline at end of file +local/ From 9a770e007b6a1cb97cbb53ed8d5d385e49ba53b4 Mon Sep 17 00:00:00 2001 From: Sergei Pakulin Date: Wed, 30 Aug 2023 12:07:55 +0500 Subject: [PATCH 56/72] lint fixes --- fedot/api/api_utils/input_analyser.py | 6 ++---- fedot/core/data/data.py | 2 -- fedot/core/data/data_preprocessing.py | 4 ++-- fedot/core/data/merge/supplementary_data_merger.py | 2 +- fedot/core/data/supplementary_data.py | 2 +- .../data_operations/categorical_encoders.py | 1 - .../data_operations/sklearn_selectors.py | 2 +- .../data_operations/sklearn_transformations.py | 8 ++++---- .../implementation_interfaces.py | 2 +- fedot/core/operations/operation.py | 3 ++- fedot/core/repository/json_evaluation.py | 2 +- fedot/core/repository/operation_types_repository.py | 4 ++-- fedot/preprocessing/base_preprocessing.py | 3 ++- fedot/preprocessing/categorical.py | 2 +- fedot/preprocessing/data_types.py | 3 ++- fedot/preprocessing/dummy_preprocessing.py | 2 +- fedot/preprocessing/preprocessing.py | 6 +++--- test/integration/models/test_repository.py | 2 +- test/unit/preprocessing/test_preprocessors.py | 6 +++--- 19 files changed, 30 insertions(+), 32 deletions(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 3b9c5f2c23..d626835741 100644 --- 
a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -1,18 +1,16 @@ from functools import partial from inspect import signature -from typing import Dict, Tuple, Any, Union +from typing import Any, Dict, Tuple, Union import numpy as np from golem.core.log import default_log -from fedot.core.composer.meta_rules import get_cv_folds_number, get_recommended_preset, \ - get_early_stopping_generations +from fedot.core.composer.meta_rules import get_cv_folds_number, get_early_stopping_generations, get_recommended_preset from fedot.core.data.data import InputData from fedot.core.data.data_preprocessing import find_categorical_columns from fedot.core.data.multi_modal import MultiModalData from fedot.core.repository.dataset_types import DataTypesEnum - meta_rules = [get_cv_folds_number, get_recommended_preset, get_early_stopping_generations] diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index bf06dbf87b..63eac97fd0 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -2,14 +2,12 @@ import glob import os - from copy import copy, deepcopy from dataclasses import dataclass, field from typing import Any, Iterable, List, Optional, Tuple, Union import numpy as np import pandas as pd - from golem.core.log import default_log from golem.utilities.requirements_notificator import warn_requirement diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c32adf36ff..dccbe12803 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -1,9 +1,9 @@ -from typing import Tuple, Optional +from typing import Optional, Tuple import numpy as np import pandas as pd -from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts +from fedot.core.data.data import InputData, data_type_is_multi_ts, data_type_is_table, data_type_is_ts from fedot.core.repository.dataset_types import DataTypesEnum from fedot.preprocessing.data_types import TYPE_TO_ID diff --git a/fedot/core/data/merge/supplementary_data_merger.py b/fedot/core/data/merge/supplementary_data_merger.py index 6a4c747e4a..6b5c414c46 100644 --- a/fedot/core/data/merge/supplementary_data_merger.py +++ b/fedot/core/data/merge/supplementary_data_merger.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import Dict, List import numpy as np from golem.core.log import default_log diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index 8a053be9a9..08c5509a6f 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional, Dict +from typing import Dict, Optional import numpy as np diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 5d2993417b..0888843268 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -2,7 +2,6 @@ from typing import List, Optional import numpy as np - from sklearn.preprocessing import LabelEncoder, OneHotEncoder from fedot.core.data.data import InputData, OutputData diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py 
b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py index 8444b4eaf0..fa880ae7fd 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from fedot.core.data.data import OutputData, InputData +from fedot.core.data.data import InputData, OutputData from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \ DataOperationImplementation from fedot.core.operations.operation_parameters import OperationParameters diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index df87b7b1b5..4037cbc89e 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -3,13 +3,13 @@ import numpy as np import pandas as pd -from sklearn.decomposition import KernelPCA, PCA, FastICA +from sklearn.decomposition import FastICA, KernelPCA, PCA from sklearn.impute import SimpleImputer from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler -from fedot.core.data.data import InputData, data_type_is_table, OutputData -from fedot.core.data.data_preprocessing import replace_inf_with_nans, convert_into_column, \ - divide_data_categorical_numerical, find_categorical_columns, data_has_categorical_features +from fedot.core.data.data import InputData, OutputData, data_type_is_table +from fedot.core.data.data_preprocessing import convert_into_column, data_has_categorical_features, \ + divide_data_categorical_numerical, find_categorical_columns, replace_inf_with_nans from fedot.core.operations.evaluation.operation_implementations. 
\ implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation from fedot.core.operations.operation_parameters import OperationParameters diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index 6e4703a6a5..aeb3e44790 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -5,7 +5,7 @@ import numpy as np from golem.core.log import default_log -from fedot.core.data.data import OutputData, InputData +from fedot.core.data.data import InputData, OutputData from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum from fedot.utilities.custom_errors import AbstractMethodNotImplementError diff --git a/fedot/core/operations/operation.py b/fedot/core/operations/operation.py index 3d4ae4134d..da44065277 100644 --- a/fedot/core/operations/operation.py +++ b/fedot/core/operations/operation.py @@ -1,5 +1,5 @@ from abc import abstractmethod -from typing import Optional, Union, Dict, Any +from typing import Any, Dict, Optional, Union from golem.core.log import default_log from golem.serializers.serializer import register_serializable @@ -120,6 +120,7 @@ def predict_for_fit(self, fitted_operation, data: InputData, params: Optional[Op def _predict(self, fitted_operation, data: InputData, params: Optional[OperationParameters] = None, output_mode: str = 'default', is_fit_stage: bool = False): + is_main_target = data.supplementary_data.is_main_target data_flow_length = data.supplementary_data.data_flow_length self._init(data.task, output_mode=output_mode, params=params, n_samples_data=data.features.shape[0]) diff --git a/fedot/core/repository/json_evaluation.py b/fedot/core/repository/json_evaluation.py index ba4483ce0e..9473fd03c4 100644 --- a/fedot/core/repository/json_evaluation.py +++ b/fedot/core/repository/json_evaluation.py @@ -1,5 +1,5 @@ from importlib import import_module -from typing import Union, TYPE_CHECKING, List +from typing import List, TYPE_CHECKING, Union # imports are required beneath in the function from fedot.core.repository.dataset_types import DataTypesEnum diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py index 7e42d95e60..6555a35242 100644 --- a/fedot/core/repository/operation_types_repository.py +++ b/fedot/core/repository/operation_types_repository.py @@ -2,13 +2,13 @@ import os from collections import defaultdict from dataclasses import dataclass -from typing import Dict, List, Optional, Union, TYPE_CHECKING +from typing import Dict, List, Optional, TYPE_CHECKING, Union import numpy as np from golem.core.log import default_log from golem.utilities.data_structures import ensure_wrapped_in_sequence -from fedot.core.constants import BEST_QUALITY_PRESET_NAME, AUTO_PRESET_NAME +from fedot.core.constants import AUTO_PRESET_NAME, BEST_QUALITY_PRESET_NAME from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.json_evaluation import import_enums_from_str, import_strategy_from_str, read_field from fedot.core.repository.tasks import Task, TaskTypesEnum diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index ae0ef29140..4c9de6cf5c 100644 --- 
a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Union, TYPE_CHECKING +from typing import Dict, Union, TYPE_CHECKING import numpy as np from sklearn.preprocessing import LabelEncoder @@ -168,6 +168,7 @@ def convert_indexes_for_predict(self, pipeline, data: Union[InputData, MultiModa def restore_index(self, input_data: InputData, result: OutputData) -> OutputData: """ restores index from ``input_data`` into ``result`` + Args: input_data: data to take the index from result: data to store index into diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index c0ea6913eb..5cde088d7a 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -6,7 +6,7 @@ from fedot.core.data.data import InputData from fedot.core.data.data_preprocessing import find_categorical_columns -from fedot.preprocessing.data_types import TYPE_TO_ID, FEDOT_STR_NAN +from fedot.preprocessing.data_types import FEDOT_STR_NAN, TYPE_TO_ID class BinaryCategoricalPreprocessor: diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 96278bf0b3..32d5f7e323 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -1,10 +1,11 @@ from __future__ import annotations from collections.abc import Sequence -from typing import TYPE_CHECKING, Tuple, Optional, List, Dict +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple import numpy as np import pandas as pd + from golem.core.log import LoggerAdapter, default_log from fedot.core.repository.tasks import Task, TaskTypesEnum diff --git a/fedot/preprocessing/dummy_preprocessing.py b/fedot/preprocessing/dummy_preprocessing.py index 36b76a390c..d3c4206e34 100644 --- a/fedot/preprocessing/dummy_preprocessing.py +++ b/fedot/preprocessing/dummy_preprocessing.py @@ -1,4 +1,4 @@ -from typing import Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Union import numpy as np from golem.core.log import default_log diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 6292cc786c..95985539d0 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -1,5 +1,5 @@ from copy import copy -from typing import Union, Optional +from typing import Optional, Union import numpy as np import pandas as pd @@ -8,7 +8,7 @@ from sklearn.preprocessing import LabelEncoder from fedot.core.data.data import InputData, np_datetime_to_numeric -from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_ts, data_type_is_text +from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts from fedot.core.data.data_preprocessing import ( data_has_categorical_features, data_has_missing_values, @@ -28,7 +28,7 @@ from fedot.core.repository.tasks import TaskTypesEnum from fedot.preprocessing.base_preprocessing import BasePreprocessor from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor -from fedot.preprocessing.data_type_check import exclude_ts, exclude_multi_ts, exclude_image +from fedot.preprocessing.data_type_check import exclude_image, exclude_multi_ts, exclude_ts from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer diff --git a/test/integration/models/test_repository.py 
b/test/integration/models/test_repository.py index 39eee3afd1..fd63299531 100644 --- a/test/integration/models/test_repository.py +++ b/test/integration/models/test_repository.py @@ -7,7 +7,7 @@ from fedot.core.repository.operation_types_repository import (OperationTypesRepository, get_operation_type_from_id) from fedot.core.repository.pipeline_operation_repository import PipelineOperationRepository -from fedot.core.repository.tasks import TaskTypesEnum, Task +from fedot.core.repository.tasks import Task, TaskTypesEnum def mocked_path(): diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 9cc3027ebb..856f59f40d 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -7,13 +7,13 @@ from fedot.core.pipelines.node import PipelineNode from fedot.core.pipelines.pipeline import Pipeline from fedot.core.repository.dataset_types import DataTypesEnum -from fedot.core.repository.tasks import TaskTypesEnum, Task +from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.core.utils import fedot_project_root from fedot.preprocessing.data_types import TYPE_TO_ID from fedot.preprocessing.data_types import TableTypesCorrector, apply_type_transformation from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME -from test.unit.preprocessing.test_pipeline_preprocessing import data_with_mixed_types_in_each_column, \ - correct_preprocessing_params +from test.unit.preprocessing.test_pipeline_preprocessing import correct_preprocessing_params, \ + data_with_mixed_types_in_each_column def get_mixed_data_with_str_and_float_values(idx: int = None): From 8c793bbe99f019c3d6def1c07b7c6444b4d09eb5 Mon Sep 17 00:00:00 2001 From: Sergei Pakulin Date: Fri, 1 Sep 2023 16:24:39 +0500 Subject: [PATCH 57/72] typings --- .../data_operations/categorical_encoders.py | 12 ++++++------ fedot/core/operations/operation.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 0888843268..dce9296c12 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -22,10 +22,10 @@ def __init__(self, params: Optional[OperationParameters] = None): 'handle_unknown': 'ignore' } self.encoder = OneHotEncoder(**{**default_params, **self.params.to_dict()}) - self.categorical_ids = None - self.non_categorical_ids = None - self.encoded_ids = None - self.new_numerical_idx = None + self.categorical_ids: List[int] = [] + self.non_categorical_ids: List[int] = [] + self.encoded_ids: List[int] = [] + self.new_numerical_idx: List[int] = [] def fit(self, input_data: InputData): """ Method for fit encoder with automatic determination of categorical features @@ -104,8 +104,8 @@ def __init__(self, params: Optional[OperationParameters] = None): super().__init__(params) # LabelEncoder has no parameters self.encoders = {} - self.categorical_ids: List[int] = None - self.non_categorical_ids: List[int] = None + self.categorical_ids: List[int] = [] + self.non_categorical_ids: List[int] = [] def fit(self, input_data: InputData): feature_type_ids = input_data.supplementary_data.col_type_ids['features'] diff --git a/fedot/core/operations/operation.py 
b/fedot/core/operations/operation.py index da44065277..3625425c7c 100644 --- a/fedot/core/operations/operation.py +++ b/fedot/core/operations/operation.py @@ -26,7 +26,7 @@ def __init__(self, operation_type: str, **kwargs): self.operation_type = operation_type self._eval_strategy = None - self.operations_repo: OperationTypesRepository = None + self.operations_repo: Optional[OperationTypesRepository] = None self.fitted_operation = None self.log = default_log(self) From ac1a57724037dc9edc37045c139063415f0d262d Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 27 Nov 2023 17:19:59 +0300 Subject: [PATCH 58/72] Adding preprocessing data at once from API --- fedot/api/api_utils/api_data.py | 54 +++++++++++++++++++++++++++ fedot/api/main.py | 10 ++++- fedot/core/data/supplementary_data.py | 2 + fedot/core/pipelines/pipeline.py | 12 ++++-- fedot/core/utils.py | 11 ++++++ 5 files changed, 85 insertions(+), 4 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index ff447fe41a..880d64dc92 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -1,7 +1,10 @@ +import sys +from datetime import datetime from typing import Dict, Union from typing import Optional import numpy as np +from golem.core.log import default_log from fedot.api.api_utils.data_definition import data_strategy_selector, FeaturesType, TargetType from fedot.core.data.data import InputData, OutputData, data_type_is_table @@ -10,6 +13,7 @@ from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.ts_wrappers import in_sample_ts_forecast, convert_forecast_to_output from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.core.utils import convert_memory_size from fedot.preprocessing.dummy_preprocessing import DummyPreprocessor from fedot.preprocessing.preprocessing import DataPreprocessor @@ -39,6 +43,8 @@ def __init__(self, task: Task, use_input_preprocessing: bool = True): self._recommendations = {'cut': self.preprocessor.cut_dataset, 'label_encoded': self.preprocessor.label_encoding_for_fit} + self.log = default_log(self) + def define_data(self, features: FeaturesType, target: Optional[TargetType] = None, @@ -123,3 +129,51 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod for name, rec in recommendations.items(): # Apply desired preprocessing function self._recommendations[name](input_data, *rec.values()) + + def fit_transform(self, train_data: InputData) -> InputData: + start_time = datetime.now() + self.log.message('Preprocessing data') + memory_usage = convert_memory_size(sys.getsizeof(train_data)) + features_shape = train_data.features.shape + target_shape = train_data.target.shape + self.log.message( + f'Train Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') + + train_data = self.preprocessor.obligatory_prepare_for_fit(data=train_data) + train_data = self.preprocessor.optional_prepare_for_fit(pipeline=Pipeline(), data=train_data) + train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data) + train_data.supplementary_data.is_auto_preprocessed = True + + memory_usage = convert_memory_size(sys.getsizeof(train_data)) + features_shape = train_data.features.shape + target_shape = train_data.target.shape + self.log.message( + f'Train Data (Processed) Memory Usage: {memory_usage} Data Shape: {features_shape, target_shape}') + self.log.message(f'Data preprocessing runtime = {datetime.now() - start_time}') + + return 
train_data + + def transform(self, test_data: InputData) -> InputData: + start_time = datetime.now() + self.log.message('Preprocessing data') + memory_usage = convert_memory_size(sys.getsizeof(test_data)) + features_shape = test_data.features.shape + target_shape = test_data.target.shape + self.log.message( + f'Test Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') + + test_data = self.preprocessor.obligatory_prepare_for_predict(data=test_data) + test_data = self.preprocessor.optional_prepare_for_predict(pipeline=Pipeline(), data=test_data) + test_data = self.preprocessor.convert_indexes_for_predict(pipeline=Pipeline(), data=test_data) + test_data = self.preprocessor.update_indices_for_time_series(test_data) + test_data.supplementary_data.is_auto_preprocessed = True + + memory_usage = convert_memory_size(sys.getsizeof(test_data)) + features_shape = test_data.features.shape + target_shape = test_data.target.shape + self.log.message( + f'Test Data (Processed) Memory Usage: {memory_usage} Data Shape: {features_shape, target_shape}') + self.log.message(f'Data preprocessing runtime = {datetime.now() - start_time}') + + return test_data + diff --git a/fedot/api/main.py b/fedot/api/main.py index f1e6718adc..5b7fed8e17 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -86,6 +86,7 @@ def __init__(self, logging_level: int = logging.ERROR, safe_mode: bool = False, n_jobs: int = -1, + auto_preprocessing: bool = False, **composer_tuner_params ): @@ -101,6 +102,7 @@ def __init__(self, self.api_composer = ApiComposer(self.params, self.metrics) + self.auto_preprocessing = auto_preprocessing # Initialize data processors for data preprocessing and preliminary data analysis self.data_processor = ApiDataProcessor(task=self.params.task, use_input_preprocessing=self.params.get('use_input_preprocessing')) @@ -156,6 +158,9 @@ def fit(self, self._init_remote_if_necessary() + if self.auto_preprocessing: + self.train_data = self.data_processor.fit_transform(self.train_data) + if predefined_model is not None: # Fit predefined model and return it without composing self.current_pipeline = PredefinedModel(predefined_model, self.train_data, self.log, @@ -258,6 +263,9 @@ def predict(self, self.test_data = self.data_processor.define_data(target=self.target, features=features, is_predict=True) self._is_in_sample_prediction = in_sample + if self.auto_preprocessing: + self.test_data = self.data_processor.transform(self.test_data) + self.prediction = self.data_processor.define_predictions(current_pipeline=self.current_pipeline, test_data=self.test_data, in_sample=self._is_in_sample_prediction, @@ -521,4 +529,4 @@ def _train_pipeline_on_full_dataset(self, recommendations: Optional[dict], self.current_pipeline.fit( full_train_not_preprocessed, n_jobs=self.params.n_jobs - ) + ) \ No newline at end of file diff --git a/fedot/core/data/supplementary_data.py b/fedot/core/data/supplementary_data.py index 08c5509a6f..77943a28e6 100644 --- a/fedot/core/data/supplementary_data.py +++ b/fedot/core/data/supplementary_data.py @@ -29,6 +29,8 @@ class SupplementaryData: non_int_idx: Optional[list] = None # Dictionary with features and target column type numeric identificators col_type_ids: Optional[Dict[str, np.ndarray]] = None + # Was the data preprocessed before composer + is_auto_preprocessed: bool = False @property def compound_mask(self): diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py index c5a727108c..9ea6f503e1 100644 --- 
a/fedot/core/pipelines/pipeline.py +++ b/fedot/core/pipelines/pipeline.py @@ -184,7 +184,10 @@ def fit(self, input_data: Union[InputData, MultiModalData], """ self.replace_n_jobs_in_nodes(n_jobs) - copied_input_data = self._preprocess(input_data) + if input_data.supplementary_data.is_auto_preprocessed: + copied_input_data = deepcopy(input_data) + else: + copied_input_data = self._preprocess(input_data) copied_input_data = self._assign_data_to_nodes(copied_input_data) if time_constraint is None: @@ -268,8 +271,11 @@ def predict(self, input_data: Union[InputData, MultiModalData], output_mode: str self.log.error(ex) raise ValueError(ex) - # Make copy of the input data to avoid performing inplace operations - copied_input_data = self._preprocess(input_data, is_fit_stage=False) + if input_data.supplementary_data.is_auto_preprocessed: + copied_input_data = deepcopy(input_data) + else: + # Make copy of the input data to avoid performing inplace operations + copied_input_data = self._preprocess(input_data, is_fit_stage=False) copied_input_data = self._assign_data_to_nodes(copied_input_data) result = self.root_node.predict(input_data=copied_input_data, output_mode=output_mode) diff --git a/fedot/core/utils.py b/fedot/core/utils.py index 044e5b2446..dd87cdc431 100644 --- a/fedot/core/utils.py +++ b/fedot/core/utils.py @@ -1,3 +1,4 @@ +import math import os import platform import random @@ -131,3 +132,13 @@ def df_to_html(df: pd.DataFrame, save_path: Union[str, os.PathLike], name: str = if table.parent.name != 'div': table = table.wrap(doc.new_tag('div', style='overflow: auto;')) file.write_text(doc.prettify()) + + +def convert_memory_size(size_bytes): + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return "%s %s" % (s, size_name[i]) \ No newline at end of file From 3084851826957ecf641c0545101fcad7270c1215 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 27 Nov 2023 21:20:07 +0300 Subject: [PATCH 59/72] Fixes in params, data preprocessor merging and fixes in tests --- fedot/api/api_utils/api_data.py | 10 +++--- fedot/api/main.py | 15 ++++----- fedot/core/data/data_preprocessing.py | 11 ++++--- fedot/preprocessing/base_preprocessing.py | 37 ++++++++++++++++++----- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index 880d64dc92..0dbf39036f 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -133,7 +133,7 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod def fit_transform(self, train_data: InputData) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') - memory_usage = convert_memory_size(sys.getsizeof(train_data)) + memory_usage = convert_memory_size(sys.getsizeof(train_data.features)) features_shape = train_data.features.shape target_shape = train_data.target.shape self.log.message( @@ -144,7 +144,7 @@ def fit_transform(self, train_data: InputData) -> InputData: train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data) train_data.supplementary_data.is_auto_preprocessed = True - memory_usage = convert_memory_size(sys.getsizeof(train_data)) + memory_usage = convert_memory_size(sys.getsizeof(train_data.features)) features_shape = train_data.features.shape target_shape = train_data.target.shape self.log.message( @@ -153,7 
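The convert_memory_size helper added to fedot/core/utils.py above is a plain log-base-1024 formatter. Reproduced standalone with a couple of spot checks (a sketch, not part of the patch):

```python
import math

def convert_memory_size(size_bytes):
    if size_bytes == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Pick the largest unit that keeps the value >= 1, then round to 2 digits
    i = int(math.floor(math.log(size_bytes, 1024)))
    return "%s %s" % (round(size_bytes / math.pow(1024, i), 2), size_name[i])

assert convert_memory_size(0) == "0B"
assert convert_memory_size(1536) == "1.5 KB"
assert convert_memory_size(10 * 1024 ** 2) == "10.0 MB"
```

Note that sys.getsizeof on an InputData object measures only the object shell, not the numpy buffers behind it, which is presumably why the hunks that follow switch the measurement to train_data.features.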
+153,7 @@ def fit_transform(self, train_data: InputData) -> InputData: return train_data - def transform(self, test_data: InputData) -> InputData: + def transform(self, test_data: InputData, current_pipeline) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') memory_usage = convert_memory_size(sys.getsizeof(test_data)) @@ -163,8 +163,8 @@ def transform(self, test_data: InputData) -> InputData: f'Test Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') test_data = self.preprocessor.obligatory_prepare_for_predict(data=test_data) - test_data = self.preprocessor.optional_prepare_for_predict(pipeline=Pipeline(), data=test_data) - test_data = self.preprocessor.convert_indexes_for_predict(pipeline=Pipeline(), data=test_data) + test_data = self.preprocessor.optional_prepare_for_predict(pipeline=current_pipeline, data=test_data) + test_data = self.preprocessor.convert_indexes_for_predict(pipeline=current_pipeline, data=test_data) test_data = self.preprocessor.update_indices_for_time_series(test_data) test_data.supplementary_data.is_auto_preprocessed = True diff --git a/fedot/api/main.py b/fedot/api/main.py index 5b7fed8e17..7627c462a7 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -86,7 +86,6 @@ def __init__(self, logging_level: int = logging.ERROR, safe_mode: bool = False, n_jobs: int = -1, - auto_preprocessing: bool = False, **composer_tuner_params ): @@ -102,7 +101,6 @@ def __init__(self, self.api_composer = ApiComposer(self.params, self.metrics) - self.auto_preprocessing = auto_preprocessing # Initialize data processors for data preprocessing and preliminary data analysis self.data_processor = ApiDataProcessor(task=self.params.task, use_input_preprocessing=self.params.get('use_input_preprocessing')) @@ -158,7 +156,7 @@ def fit(self, self._init_remote_if_necessary() - if self.auto_preprocessing: + if self.params.get('use_input_preprocessing'): self.train_data = self.data_processor.fit_transform(self.train_data) if predefined_model is not None: @@ -180,9 +178,12 @@ def fit(self, else: self.log.message('Already fitted initial pipeline is used') - # Store data encoder in the pipeline if it is required + # Merge API & pipelines encoders if it is required self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( - self.data_processor.preprocessor, self.current_pipeline.preprocessor) + api_preprocessor=self.data_processor.preprocessor, + pipeline_preprocessor=self.current_pipeline.preprocessor, + use_input_preprocessing=self.params.get('use_input_preprocessing') + ) self.log.message(f'Final pipeline: {graph_structure(self.current_pipeline)}') @@ -263,8 +264,8 @@ def predict(self, self.test_data = self.data_processor.define_data(target=self.target, features=features, is_predict=True) self._is_in_sample_prediction = in_sample - if self.auto_preprocessing: - self.test_data = self.data_processor.transform(self.test_data) + if self.params.get('use_input_preprocessing'): + self.test_data = self.data_processor.transform(self.test_data, self.current_pipeline) self.prediction = self.data_processor.define_predictions(current_pipeline=self.current_pipeline, test_data=self.test_data, diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index dccbe12803..c8f9fd383a 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -98,15 +98,16 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: 
return False - column_type_ids = data.supplementary_data.col_type_ids['features'] - cat_ids, non_cat_ids = find_categorical_columns(data.features, column_type_ids) - data_has_categorical_columns = len(cat_ids) > 0 + feature_type_ids = data.supplementary_data.col_type_ids['features'] + cat_ids, non_cat_ids = find_categorical_columns(data.features, feature_type_ids) data.numerical_idx = non_cat_ids data.categorical_idx = cat_ids - data.categorical_features = data.subset_features(cat_ids).features - return data_has_categorical_columns + if len(cat_ids) > 0: + data.categorical_features = data.subset_features(cat_ids).features + + return bool(cat_ids) def data_has_text_features(data: InputData) -> bool: diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index 4c9de6cf5c..df728fa1f3 100644 --- a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -211,7 +211,9 @@ def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligator @staticmethod def merge_preprocessors(api_preprocessor: 'BasePreprocessor', - pipeline_preprocessor: 'BasePreprocessor') -> 'BasePreprocessor': + pipeline_preprocessor: 'BasePreprocessor', + use_input_preprocessing: bool, + ) -> 'BasePreprocessor': """ Combines two preprocessor's objects. @@ -222,11 +224,32 @@ def merge_preprocessors(api_preprocessor: 'BasePreprocessor', Returns: merged preprocessor """ - # Take all obligatory data preprocessing from API - new_data_preprocessor = api_preprocessor + # If was used auto preprocessor + if use_input_preprocessing: + # Take all obligatory data preprocessing from obtained pipelines + new_data_preprocessor = pipeline_preprocessor + + # Update optional preprocessing (take it from API preprocessor) + if not new_data_preprocessor.features_encoders: + # Store features encoder from API preprocessor because there are no encoding in obtained pipelines + new_data_preprocessor.features_encoders = api_preprocessor.features_encoders + + if not new_data_preprocessor.features_imputers: + # Same with Nan's imputers + new_data_preprocessor.features_imputers = api_preprocessor.features_imputers + + # If was used pipelines preprocessors + else: + # Take all obligatory data preprocessing from API + new_data_preprocessor = api_preprocessor + + # Update optional preprocessing (take it from obtained pipeline) + if not new_data_preprocessor.features_encoders: + # Store features encoder from obtained pipeline because in API there are no encoding + new_data_preprocessor.features_encoders = pipeline_preprocessor.features_encoders + + if not new_data_preprocessor.features_imputers: + # Same with Nan's imputers + new_data_preprocessor.features_imputers = pipeline_preprocessor.features_imputers - # Update optional preprocessing (take it from obtained pipeline) - if not new_data_preprocessor.features_encoders: - # Store features encoder from obtained pipeline because in API there are no encoding - new_data_preprocessor.features_encoders = pipeline_preprocessor.features_encoders return new_data_preprocessor From adaf590e99985d53c926143345c92a00d2d7f57f Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 28 Nov 2023 15:26:02 +0300 Subject: [PATCH 60/72] Fixes for MultiModalData --- fedot/core/pipelines/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py index 9ea6f503e1..9f874f4fcc 100644 --- a/fedot/core/pipelines/pipeline.py +++ 
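The rewritten data_has_categorical_features above caches the categorical/numerical column split on the data object, materialises the categorical subset only when at least one categorical column exists, and returns bool(cat_ids) directly. A condensed sketch of that control flow on plain arrays (the ids are assumed precomputed here; the real code derives them via find_categorical_columns):

```python
import numpy as np

def has_categorical_sketch(features: np.ndarray, cat_ids: list):
    # Subset the table only when there is something categorical to subset
    categorical_features = features[:, cat_ids] if len(cat_ids) > 0 else None
    return bool(cat_ids), categorical_features

features = np.array([['a', 1.5], ['b', 2.0]], dtype=object)
has_cats, cats = has_categorical_sketch(features, cat_ids=[0])
assert has_cats and cats.shape == (2, 1)
```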
b/fedot/core/pipelines/pipeline.py @@ -184,7 +184,7 @@ def fit(self, input_data: Union[InputData, MultiModalData], """ self.replace_n_jobs_in_nodes(n_jobs) - if input_data.supplementary_data.is_auto_preprocessed: + if isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed: copied_input_data = deepcopy(input_data) else: copied_input_data = self._preprocess(input_data) @@ -271,7 +271,7 @@ def predict(self, input_data: Union[InputData, MultiModalData], output_mode: str self.log.error(ex) raise ValueError(ex) - if input_data.supplementary_data.is_auto_preprocessed: + if isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed: copied_input_data = deepcopy(input_data) else: # Make copy of the input data to avoid performing inplace operations From 097c1633bec48f704518e7cc6deff33fa5fc7401 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 28 Nov 2023 16:34:54 +0300 Subject: [PATCH 61/72] Added new api param, fix in merge, fixes & editing tests --- fedot/api/api_utils/api_params_repository.py | 3 +- fedot/api/main.py | 8 ++-- fedot/preprocessing/base_preprocessing.py | 18 ++++----- .../test_preprocessing_through_api.py | 40 ++++++++++++++----- 4 files changed, 46 insertions(+), 23 deletions(-) diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py index d71e300685..b2ca5d612d 100644 --- a/fedot/api/api_utils/api_params_repository.py +++ b/fedot/api/api_utils/api_params_repository.py @@ -18,7 +18,7 @@ class ApiParamsRepository: COMPOSER_REQUIREMENTS_KEYS = {'max_arity', 'max_depth', 'num_of_generations', 'early_stopping_iterations', 'early_stopping_timeout', - 'parallelization_mode', 'use_input_preprocessing', + 'parallelization_mode', 'use_input_preprocessing', 'use_auto_preprocessing', 'show_progress', 'collect_intermediate_metric', 'keep_n_best', 'keep_history', 'history_dir', 'cv_folds'} @@ -62,6 +62,7 @@ def default_params_for_task(task_type: TaskTypesEnum) -> dict: use_pipelines_cache=True, use_preprocessing_cache=True, use_input_preprocessing=True, + use_auto_preprocessing=False, use_meta_rules=False, cache_dir=default_fedot_data_dir(), keep_history=True, diff --git a/fedot/api/main.py b/fedot/api/main.py index 7627c462a7..b28d3dc8c7 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -142,7 +142,7 @@ def fit(self, self.train_data = self.data_processor.define_data(features=features, target=target, is_predict=False) self.params.update_available_operations_by_preset(self.train_data) - if self.params.get('use_input_preprocessing'): + if self.params.get('use_auto_preprocessing'): # Launch data analyser - it gives recommendations for data preprocessing recommendations_for_data, recommendations_for_params = \ self.data_analyser.give_recommendations(input_data=self.train_data, @@ -156,7 +156,7 @@ def fit(self, self._init_remote_if_necessary() - if self.params.get('use_input_preprocessing'): + if isinstance(self.train_data, InputData) and self.params.get('use_auto_preprocessing'): self.train_data = self.data_processor.fit_transform(self.train_data) if predefined_model is not None: @@ -182,7 +182,7 @@ def fit(self, self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( api_preprocessor=self.data_processor.preprocessor, pipeline_preprocessor=self.current_pipeline.preprocessor, - use_input_preprocessing=self.params.get('use_input_preprocessing') + use_input_preprocessing=self.params.get('use_auto_preprocessing') ) self.log.message(f'Final pipeline: 
{graph_structure(self.current_pipeline)}') @@ -264,7 +264,7 @@ def predict(self, self.test_data = self.data_processor.define_data(target=self.target, features=features, is_predict=True) self._is_in_sample_prediction = in_sample - if self.params.get('use_input_preprocessing'): + if isinstance(self.test_data, InputData) and self.params.get('use_auto_preprocessing'): self.test_data = self.data_processor.transform(self.test_data, self.current_pipeline) self.prediction = self.data_processor.define_predictions(current_pipeline=self.current_pipeline, diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index df728fa1f3..556cff6182 100644 --- a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -227,16 +227,16 @@ def merge_preprocessors(api_preprocessor: 'BasePreprocessor', # If was used auto preprocessor if use_input_preprocessing: # Take all obligatory data preprocessing from obtained pipelines - new_data_preprocessor = pipeline_preprocessor - - # Update optional preprocessing (take it from API preprocessor) - if not new_data_preprocessor.features_encoders: - # Store features encoder from API preprocessor because there are no encoding in obtained pipelines - new_data_preprocessor.features_encoders = api_preprocessor.features_encoders + new_data_preprocessor = api_preprocessor - if not new_data_preprocessor.features_imputers: - # Same with Nan's imputers - new_data_preprocessor.features_imputers = api_preprocessor.features_imputers + # # Update optional preprocessing (take it from API preprocessor) + # if not new_data_preprocessor.features_encoders: + # # Store features encoder from API preprocessor because there are no encoding in obtained pipelines + # new_data_preprocessor.features_encoders = api_preprocessor.features_encoders + # + # if not new_data_preprocessor.features_imputers: + # # Same with Nan's imputers + # new_data_preprocessor.features_imputers = api_preprocessor.features_imputers # If was used pipelines preprocessors else: diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py index a4d0e83fd8..6e42ee0975 100644 --- a/test/unit/preprocessing/test_preprocessing_through_api.py +++ b/test/unit/preprocessing/test_preprocessing_through_api.py @@ -16,7 +16,7 @@ def data_with_only_categorical_features(): features = np.array([["'a'", "0", "1"], ["'b'", "1", "0"], ["'c'", "1", "0"]], dtype=object) - input_data = InputData(idx=[0, 1, 2], features=features, + input_data = InputData(idx=np.array([0, 1, 2]), features=features, target=np.array([0, 1, 2]), task=task, data_type=DataTypesEnum.table, supplementary_data=supp_data) @@ -41,7 +41,7 @@ def data_with_too_much_nans(): [9, '1', np.inf], [8, np.nan, np.inf]], dtype=object) target = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -61,7 +61,7 @@ def data_with_spaces_and_nans_in_features(): ['0 ', ' 1'], ['1 ', ' 0']], dtype=object) target = np.array([[0], [1], [2], [3], [4], [5]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5]), features=features, target=target, task=task, data_type=DataTypesEnum.table, 
supplementary_data=SupplementaryData()) @@ -78,7 +78,7 @@ def data_with_nans_in_target_column(): [3, 4], [1, 3]]) target = np.array([[0], [1], [np.nan], [np.nan], [4], [5]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -98,7 +98,7 @@ def data_with_nans_in_multi_target(): [3, 4], [1, 3]]) target = np.array([[0, 2], [1, 3], [np.nan, np.nan], [3, np.nan], [4, 4], [5, 6]]) - train_input = InputData(idx=[0, 1, 2, 3, 4, 5], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4, 5]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -123,7 +123,7 @@ def data_with_categorical_target(with_nan: bool = False): target = np.array(['blue', np.nan, np.nan, 'di'], dtype=object) else: target = np.array(['blue', 'da', 'ba', 'di'], dtype=str) - train_input = InputData(idx=[0, 1, 2, 3], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -140,7 +140,7 @@ def data_with_text_features(): dtype=object) target = np.array([[0], [1], [0], [1]]) - train_input = InputData(idx=[0, 1, 2, 3], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3]), features=features, target=target, task=task, data_type=DataTypesEnum.text, supplementary_data=SupplementaryData()) @@ -159,7 +159,7 @@ def data_with_pseudo_text_features(): dtype=object) target = np.array([[0], [1], [0], [1], [0]]) - train_input = InputData(idx=[0, 1, 2, 3, 4], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4]), features=features, target=target, task=task, data_type=DataTypesEnum.table, supplementary_data=SupplementaryData()) @@ -177,7 +177,7 @@ def data_with_text_features_and_nans(): dtype=object) target = np.array([[0], [1], [0], [1], [0]]) - train_input = InputData(idx=[0, 1, 2, 3, 4], features=features, + train_input = InputData(idx=np.array([0, 1, 2, 3, 4]), features=features, target=target, task=task, data_type=DataTypesEnum.text, supplementary_data=SupplementaryData()) @@ -250,3 +250,25 @@ def test_correct_api_dataset_with_pseudo_text_preprocessing(): node_tags = [node.tags for node in fedot_model.current_pipeline.nodes] assert not any('text' in current_tags for current_tags in node_tags) assert fedot_model.prediction.features.shape[0] == input_data.features.shape[0] + + +def test_auto_preprocessing_mode(): + funcs = [data_with_only_categorical_features, data_with_too_much_nans, + data_with_spaces_and_nans_in_features, data_with_nans_in_target_column, + data_with_nans_in_multi_target] + + # Check for all datasets + for data_generator in funcs: + input_data = data_generator() + single_processing = Fedot(problem='regression', use_auto_preprocessing=True) + multi_processing = Fedot(problem='regression', use_auto_preprocessing=False) + + pipeline_single = single_processing.fit(input_data, predefined_model='auto') + pipeline_multi = multi_processing.fit(input_data, predefined_model='auto') + + prediction_single = pipeline_single.predict(input_data) + prediction_multi = pipeline_multi.predict(input_data) + + assert prediction_single.features.shape == prediction_multi.features.shape + assert (prediction_single.features == prediction_multi.features).all() + assert (prediction_single.predict ==
prediction_multi.predict).all() From 94b6af56c6db138deee3191a93983f06ebcb09e6 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 29 Nov 2023 20:15:15 +0300 Subject: [PATCH 62/72] Fix param for test --- fedot/api/api_utils/api_params_repository.py | 2 +- fedot/api/builder.py | 2 ++ test/unit/api/test_api_params.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py index b2ca5d612d..e1626db0b1 100644 --- a/fedot/api/api_utils/api_params_repository.py +++ b/fedot/api/api_utils/api_params_repository.py @@ -18,7 +18,7 @@ class ApiParamsRepository: COMPOSER_REQUIREMENTS_KEYS = {'max_arity', 'max_depth', 'num_of_generations', 'early_stopping_iterations', 'early_stopping_timeout', - 'parallelization_mode', 'use_input_preprocessing', 'use_auto_preprocessing', + 'parallelization_mode', 'use_input_preprocessing', 'show_progress', 'collect_intermediate_metric', 'keep_n_best', 'keep_history', 'history_dir', 'cv_folds'} diff --git a/fedot/api/builder.py b/fedot/api/builder.py index 449b076833..1a7b7bdf15 100644 --- a/fedot/api/builder.py +++ b/fedot/api/builder.py @@ -330,6 +330,7 @@ def setup_data_preprocessing( safe_mode: bool = DEFAULT_VALUE, use_input_preprocessing: bool = DEFAULT_VALUE, use_preprocessing_cache: bool = DEFAULT_VALUE, + use_auto_preprocessing: bool = DEFAULT_VALUE, ) -> FedotBuilder: """ Sets parameters of input data preprocessing. @@ -351,6 +352,7 @@ def setup_data_preprocessing( safe_mode=safe_mode, use_input_preprocessing=use_input_preprocessing, use_preprocessing_cache=use_preprocessing_cache, + use_auto_preprocessing=use_auto_preprocessing, ) return self diff --git a/test/unit/api/test_api_params.py b/test/unit/api/test_api_params.py index 7295ababa9..f19e96b0d0 100644 --- a/test/unit/api/test_api_params.py +++ b/test/unit/api/test_api_params.py @@ -35,6 +35,7 @@ use_pipelines_cache=True, use_preprocessing_cache=True, use_input_preprocessing=True, + use_auto_preprocessing=False, cache_dir='cache', keep_history=True, history_dir='history', From be007cb768d6887c7b07d7b03e416ee77d1045bb Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 29 Nov 2023 20:34:30 +0300 Subject: [PATCH 63/72] Fix bug in API --- fedot/api/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/main.py b/fedot/api/main.py index b28d3dc8c7..a9c42ec1f5 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -142,7 +142,7 @@ def fit(self, self.train_data = self.data_processor.define_data(features=features, target=target, is_predict=False) self.params.update_available_operations_by_preset(self.train_data) - if self.params.get('use_auto_preprocessing'): + if self.params.get('use_input_preprocessing'): # Launch data analyser - it gives recommendations for data preprocessing recommendations_for_data, recommendations_for_params = \ self.data_analyser.give_recommendations(input_data=self.train_data, From 8e046f51abfe2d761b4ca6a2faec1d5de18768af Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 5 Dec 2023 14:54:09 +0300 Subject: [PATCH 64/72] @kasyanovse requested improvements --- fedot/core/data/data_preprocessing.py | 12 +++--------- fedot/preprocessing/preprocessing.py | 6 ++---- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index c8f9fd383a..b0076e9f0a 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -72,15
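From the user side, the flag introduced in these patches is exercised exactly as in test_auto_preprocessing_mode above. A minimal usage sketch follows; the import path and toy dataset are assumptions for illustration, not taken from the patch:

```python
from sklearn.datasets import load_iris  # hypothetical demo dataset

from fedot.api.main import Fedot

x, y = load_iris(return_X_y=True)
model = Fedot(problem='classification', use_auto_preprocessing=True)
pipeline = model.fit(features=x, target=y, predefined_model='auto')
prediction = model.predict(features=x)
```

With use_auto_preprocessing=True the API preprocesses the data once up front and marks it via supplementary_data.is_auto_preprocessed, so Pipeline.fit and Pipeline.predict skip their own obligatory preprocessing pass.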
+72,9 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda def force_categorical_determination(table: np.ndarray): """ Find string columns using 'computationally expensive' approach """ - categorical_ids = [] - non_categorical_ids = [] - # For every column in table make check - for column_id, column in enumerate(table.T): - # Check if column is of string objects - if pd.api.types.infer_dtype(column, skipna=True) == 'string': - categorical_ids.append(column_id) - else: - non_categorical_ids.append(column_id) + real_columns_selector = np.all(np.isreal(table), axis=0) + non_categorical_ids = np.flatnonzero(real_columns_selector).tolist() + categorical_ids = np.flatnonzero(~real_columns_selector).tolist() return categorical_ids, non_categorical_ids diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 95985539d0..b45e2ceabc 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -314,11 +314,9 @@ def _clean_extra_spaces(data: InputData) -> InputData: """ def strip_all_strs(item: Union[object, str]): - try: + if isinstance(item, str): return item.strip() - except AttributeError: - # not an str object - return item + return item features_df = pd.DataFrame(data.features) mixed_or_str = features_df.select_dtypes(object) From cac26f691c002909f599127b69eeac8ea44db946 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 5 Dec 2023 17:08:42 +0300 Subject: [PATCH 65/72] Return fixes --- fedot/core/data/data_preprocessing.py | 14 +++++++++----- fedot/preprocessing/preprocessing.py | 6 ++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index b0076e9f0a..49650e71c7 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -72,11 +72,15 @@ def find_categorical_columns(table: np.ndarray, column_type_ids: Optional[np.nda def force_categorical_determination(table: np.ndarray): """ Find string columns using 'computationally expensive' approach """ - real_columns_selector = np.all(np.isreal(table), axis=0) - non_categorical_ids = np.flatnonzero(real_columns_selector).tolist() - categorical_ids = np.flatnonzero(~real_columns_selector).tolist() - - return categorical_ids, non_categorical_ids + categorical_ids = [] + non_categorical_ids = [] + # For every column in table make check + for column_id, column in enumerate(table.T): + # Check if column is of string objects + if pd.api.types.infer_dtype(column, skipna=True) == 'string': + categorical_ids.append(column_id) + else: + non_categorical_ids.append(column_id) def data_has_missing_values(data: InputData) -> bool: diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index b45e2ceabc..95985539d0 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -314,9 +314,11 @@ def _clean_extra_spaces(data: InputData) -> InputData: """ def strip_all_strs(item: Union[object, str]): - if isinstance(item, str): + try: return item.strip() - return item + except AttributeError: + # not an str object + return item features_df = pd.DataFrame(data.features) mixed_or_str = features_df.select_dtypes(object) From 5f62ef408834fed5f4a387f2b71ac61aae2346a2 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 6 Dec 2023 15:06:06 +0300 Subject: [PATCH 66/72] Return fixes (1) --- fedot/core/data/data_preprocessing.py | 2 ++ 1 file changed, 2 insertions(+) 
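Patches 64 and 65 toggle strip_all_strs between an isinstance check (LBYL) and the original try/except (EAFP). The two guards are behaviourally equivalent on mixed-type cells; the EAFP form merely avoids an explicit type check per cell. A quick equivalence sketch, not part of the patch:

```python
import pandas as pd

def strip_lbyl(item):
    # Look-before-you-leap: check the type explicitly
    return item.strip() if isinstance(item, str) else item

def strip_eafp(item):
    # Easier-to-ask-forgiveness: rely on non-strings lacking .strip()
    try:
        return item.strip()
    except AttributeError:  # not a str object
        return item

df = pd.DataFrame([[' a ', 1], ['b ', None]], dtype=object)
assert df.applymap(strip_lbyl).equals(df.applymap(strip_eafp))
```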
diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 49650e71c7..c8f9fd383a 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -82,6 +82,8 @@ def force_categorical_determination(table: np.ndarray): else: non_categorical_ids.append(column_id) + return categorical_ids, non_categorical_ids + def data_has_missing_values(data: InputData) -> bool: """ Check data for missing values.""" From b96214849444dea767df69bceb4c4588cea180c9 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 14:09:19 +0300 Subject: [PATCH 67/72] Remove transformations to str categories --- fedot/api/api_utils/input_analyser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index d626835741..3be6ecc3f9 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -115,5 +115,5 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) + uniques = np.unique(input_data.features[:, categorical_ids]) return len(uniques) > self.max_cat_cardinality From d5b06482e9bac215c30c60f7a55642be452d00e4 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 14:27:57 +0300 Subject: [PATCH 68/72] Return transformations to str for categories --- fedot/api/api_utils/input_analyser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 3be6ecc3f9..d626835741 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -115,5 +115,5 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - uniques = np.unique(input_data.features[:, categorical_ids]) + uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) return len(uniques) > self.max_cat_cardinality From 8be836e9b545affbdd8ad9cfa834869fa0691e3e Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 17:01:36 +0300 Subject: [PATCH 69/72] Fix control_categorical for label encoder --- fedot/api/api_utils/input_analyser.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index d626835741..25dd01fa64 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -77,6 +77,7 @@ def _give_recommendations_for_data(self, input_data: InputData) -> Dict: recommendations_for_data['cut'] = {'border': border} is_label_encoding_needed = self.control_categorical(input_data) if is_label_encoding_needed: + self._log('Switch categorical encoder to label encoder') recommendations_for_data['label_encoded'] = {} return recommendations_for_data @@ -115,5 +116,6 @@ def control_categorical(self, input_data: InputData) -> bool: """ categorical_ids, _ = find_categorical_columns(input_data.features) - uniques = np.unique(input_data.features[:, categorical_ids].astype(str)) - return len(uniques) > self.max_cat_cardinality + # Count unique categories in each categorical column and sum them up + uniques_cats = sum([len(np.unique(feature)) for feature in input_data.features[:, categorical_ids].astype(str).T]) + return uniques_cats >
self.max_cat_cardinality From a91c9ba7ee2b2fc58ecd2bdb57ee076857fa4539 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 7 Dec 2023 17:13:05 +0300 Subject: [PATCH 70/72] Fix log message --- fedot/api/api_utils/input_analyser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 25dd01fa64..61f91770e5 100644 --- a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -77,7 +77,7 @@ def _give_recommendations_for_data(self, input_data: InputData) -> Dict: recommendations_for_data['cut'] = {'border': border} is_label_encoding_needed = self.control_categorical(input_data) if is_label_encoding_needed: - self._log('Switch categorical encoder to label encoder') + self._log.info('Switch categorical encoder to label encoder') recommendations_for_data['label_encoded'] = {} return recommendations_for_data From 304e29fb0d2d62d029ade22bb0ddeca5af2ac479 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 8 Dec 2023 23:01:12 +0300 Subject: [PATCH 71/72] Small fixes with merger --- fedot/api/main.py | 2 +- fedot/preprocessing/base_preprocessing.py | 13 ++----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/fedot/api/main.py b/fedot/api/main.py index a9c42ec1f5..81bbb460da 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -182,7 +182,7 @@ def fit(self, self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( api_preprocessor=self.data_processor.preprocessor, pipeline_preprocessor=self.current_pipeline.preprocessor, - use_input_preprocessing=self.params.get('use_auto_preprocessing') + use_auto_preprocessing=self.params.get('use_auto_preprocessing') ) self.log.message(f'Final pipeline: {graph_structure(self.current_pipeline)}') diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index 556cff6182..7871af8fc4 100644 --- a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -212,7 +212,7 @@ def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligator @staticmethod def merge_preprocessors(api_preprocessor: 'BasePreprocessor', pipeline_preprocessor: 'BasePreprocessor', - use_input_preprocessing: bool, + use_auto_preprocessing: bool, ) -> 'BasePreprocessor': """ Combines two preprocessor's objects. 
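With the column-wise iteration in patch 69, control_categorical sums per-column cardinalities before comparing the total against max_cat_cardinality. On a toy table (a sketch with precomputed categorical ids):

```python
import numpy as np

features = np.array([['a', 'x'], ['b', 'x'], ['a', 'y']], dtype=object)
categorical_ids = [0, 1]

# Two unique categories per column -> total cardinality of 4
uniques_cats = sum(len(np.unique(col))
                   for col in features[:, categorical_ids].astype(str).T)
assert uniques_cats == 4
```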
@@ -225,19 +225,10 @@ def merge_preprocessors(api_preprocessor: 'BasePreprocessor', merged preprocessor """ # If was used auto preprocessor - if use_input_preprocessing: + if use_auto_preprocessing: # Take all obligatory data preprocessing from obtained pipelines new_data_preprocessor = api_preprocessor - # # Update optional preprocessing (take it from API preprocessor) - # if not new_data_preprocessor.features_encoders: - # # Store features encoder from API preprocessor because there are no encoding in obtained pipelines - # new_data_preprocessor.features_encoders = api_preprocessor.features_encoders - # - # if not new_data_preprocessor.features_imputers: - # # Same with Nan's imputers - # new_data_preprocessor.features_imputers = api_preprocessor.features_imputers - # If was used pipelines preprocessors else: # Take all obligatory data preprocessing from API From 0c48f7ff945c825c9af59c86b51e9d3e3eff5a4b Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 11 Dec 2023 19:52:06 +0300 Subject: [PATCH 72/72] @andreygetmanov requested fixes --- fedot/core/utils.py | 10 +++++----- fedot/preprocessing/preprocessing.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fedot/core/utils.py b/fedot/core/utils.py index dd87cdc431..8e1654e7c9 100644 --- a/fedot/core/utils.py +++ b/fedot/core/utils.py @@ -137,8 +137,8 @@ def df_to_html(df: pd.DataFrame, save_path: Union[str, os.PathLike], name: str = def convert_memory_size(size_bytes): if size_bytes == 0: return "0B" - size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - i = int(math.floor(math.log(size_bytes, 1024))) - p = math.pow(1024, i) - s = round(size_bytes / p, 2) - return "%s %s" % (s, size_name[i]) \ No newline at end of file + digit_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + integer_size_value = int(math.floor(math.log(size_bytes, 1024))) + byte_digit = math.pow(1024, integer_size_value) + size_in_digit_name = round(size_bytes / byte_digit, 2) + return "%s %s" % (size_in_digit_name, digit_name[integer_size_value]) \ No newline at end of file diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 95985539d0..ac1c165fb4 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -317,7 +317,7 @@ def strip_all_strs(item: Union[object, str]): try: return item.strip() except AttributeError: - # not an str object + # not a str object return item features_df = pd.DataFrame(data.features)
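Taken together, patches 59, 61 and 71 reduce the merge rule to: the API preprocessor always provides the obligatory steps, and pipeline-side encoders and imputers are borrowed only when auto preprocessing was off and the API side has none of its own. A condensed sketch of the final behaviour, with duck-typed preprocessor objects assumed:

```python
def merge_preprocessors_sketch(api_prep, pipeline_prep, use_auto_preprocessing: bool):
    merged = api_prep  # obligatory preprocessing always comes from the API side
    if not use_auto_preprocessing:
        # Optional steps were fitted inside the obtained pipeline; borrow them
        # only when the API preprocessor has none of its own
        if not merged.features_encoders:
            merged.features_encoders = pipeline_prep.features_encoders
        if not merged.features_imputers:
            merged.features_imputers = pipeline_prep.features_imputers
    return merged
```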