Skip to content

Commit

Permalink
simplified data_types.py
Browse files Browse the repository at this point in the history
  • Loading branch information
IIaKyJIuH committed Apr 24, 2023
1 parent ff1023e commit 29d27a2
Showing 1 changed file with 9 additions and 20 deletions.
29 changes: 9 additions & 20 deletions fedot/preprocessing/data_types.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from copy import copy
from typing import TYPE_CHECKING, Tuple
from typing import TYPE_CHECKING, Tuple, Optional, Dict

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -128,17 +128,12 @@ def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict):
table = np.delete(table, self.columns_to_del, 1)
return table

def features_types_converting(self, features: np.ndarray) -> np.array:
def features_types_converting(self, features: np.ndarray) -> np.ndarray:
""" Convert all elements in the data in every feature column into one type
:param features: tabular features array
"""
features_with_mixed_types = find_mixed_types_columns(self.features_columns_info)

if not features_with_mixed_types:
return features

# There are mixed-types columns in features table - convert them
for mixed_column_id in features_with_mixed_types:
column_info = self.features_columns_info[mixed_column_id]

Expand Down Expand Up @@ -309,7 +304,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
"""
features_types = data.supplementary_data.column_types['features']
is_numeric_type = np.isin(features_types, [TYPE_TO_ID[int], TYPE_TO_ID[float]])
numeric_type_ids = np.nonzero(is_numeric_type)[0]
numeric_type_ids = np.flatnonzero(is_numeric_type)
num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids)
nuniques = num_df.nunique(dropna=True)
# reduce dataframe to include only categorical features
Expand Down Expand Up @@ -401,24 +396,16 @@ def _into_numeric_features_transformation_for_predict(self, data: InputData):
features_types[column_id] = TYPE_TO_ID[float]


def define_column_types(table: np.ndarray):
def define_column_types(table: Optional[np.ndarray]) -> Dict:
""" Prepare information about types per columns. For each column store unique
types that the column contains. If a column with mixed types contains str objects,
an additional field 'str_ids' with the indices of the string objects is prepared
"""
if table is None:
return {}

#df_of_types = pd.DataFrame(table_of_types).transform()
nans = pd.isna(table)
table_of_types = np.empty_like(table, dtype=np.int8)
table_of_types[~nans] = [
TYPE_TO_ID[type(x.item() if isinstance(x, (np.ndarray, np.generic)) else x)]
for x in table[~nans]
]
table_of_types[nans] = TYPE_TO_ID[type(None)]

table_of_types = pd.DataFrame(table_of_types)
table_of_types = pd.DataFrame(table, copy=True)
table_of_types = table_of_types.applymap(lambda el: TYPE_TO_ID[type(None if pd.isna(el) else el)]).astype(np.int8)

# Build dataframe with unique types for each column
uniques = table_of_types.apply([pd.unique]).rename(index={'unique': 'types'})
Expand All @@ -440,7 +427,9 @@ def define_column_types(table: np.ndarray):
)

# Build dataframe with nans indices
nans_ids = pd.DataFrame(nans).apply(np.where).rename(index={0: 'nan_ids'})
nans_ids = (table_of_types == TYPE_TO_ID[type(None)]).apply(np.where).rename(index={0: 'nan_ids'})

# Combine all dataframes
return pd.concat([uniques, types_counts, nans_ids]).to_dict()


Expand Down

0 comments on commit 29d27a2

Please sign in to comment.