From 1ef601540834f763322c9a3b96f2eed0e7ce5806 Mon Sep 17 00:00:00 2001 From: marcuspappik Date: Fri, 21 Apr 2017 15:19:42 +0200 Subject: [PATCH] wont talk about that --- hics/incremental_correlation.py | 6 +- hics/scored_slices.py | 101 ++++++++++++++------------------ hics/slice_similarity.py | 16 +++-- tests/test_scored_slices.py | 4 +- tests/test_slice_similarity.py | 8 +-- 5 files changed, 66 insertions(+), 69 deletions(-) diff --git a/hics/incremental_correlation.py b/hics/incremental_correlation.py index 7444d88..c673ad2 100644 --- a/hics/incremental_correlation.py +++ b/hics/incremental_correlation.py @@ -43,12 +43,12 @@ def _update_redundancy_table(self, new_weights, new_redundancies): def _update_slices(self, new_slices): current_slices = self.result_storage.get_slices() - for feature_set, new_slics in new_slices.items(): + for feature_set, slices_to_add in new_slices.items(): if not feature_set in current_slices: - current_slices[feature_set] = new_slices[feature_set] + current_slices[feature_set] = slices_to_add else: - current_slices[feature_set].add_slices(new_slices[feature_set]) + current_slices[feature_set].add_slices(slices_to_add) current_slices[feature_set].reduce_slices() diff --git a/hics/scored_slices.py b/hics/scored_slices.py index 524485c..da691de 100644 --- a/hics/scored_slices.py +++ b/hics/scored_slices.py @@ -6,11 +6,11 @@ class ScoredSlices: def __init__(self, categorical, continuous, to_keep = 5, threshold = None): - self.continuous = pd.Panel({feature : pd.DataFrame(columns = ['to_value', 'from_value']) - for feature in continuous}) + self.continuous = {feature : pd.DataFrame(columns = ['to_value', 'from_value']) + for feature in continuous} - self.categorical = pd.Panel({feature['name'] : pd.DataFrame(columns = feature['values']) - for feature in categorical}) + self.categorical = {feature['name'] : pd.DataFrame(columns = feature['values']) + for feature in categorical} self.scores = pd.Series() self.to_keep = to_keep @@ -28,43 +28,33 @@ def add_slices(self, slices): self.add_from_object(slices) def add_from_object(self, slices): - temp_continuous = {} - temp_categorical = {} - self.scores = self.scores.append(pd.Series(slices.scores)).sort_values(ascending = False, inplace = False) - for feature, df in slices.continuous.iteritems(): - temp_continuous[feature] = pd.concat([self.continuous[feature], df], ignore_index = True) - temp_continuous[feature] = temp_continuous[feature].loc[self.scores.index, :].reset_index(drop = True) + for feature, df in slices.continuous.items(): + self.continuous[feature] = pd.concat([self.continuous[feature], df], ignore_index = True) + self.continuous[feature] = self.continuous[feature].loc[self.scores.index, :].reset_index(drop = True) - for feature, df in slices.categorical.iteritems(): - temp_categorical[feature] = pd.concat([self.categorical[feature], df], ignore_index = True) - temp_categorical[feature] = temp_categorical[feature].loc[self.scores.index, :].reset_index(drop = True) + for feature, df in slices.categorical.items(): + self.categorical[feature] = pd.concat([self.categorical[feature], df], ignore_index = True) + self.categorical[feature] = self.categorical[feature].loc[self.scores.index, :].reset_index(drop = True) self.scores.reset_index(drop = True, inplace = True) - self.continuous = pd.Panel(temp_continuous) - self.categorical = pd.Panel(temp_categorical) def add_from_dict(self, slices): - temp_continuous = {} - temp_categorical = {} - new_scores = pd.Series(slices['scores']) self.scores = self.scores.append(new_scores, ignore_index = True).sort_values(ascending = False, inplace = False) - for feature in self.continuous.items.values: + for feature in self.continuous: content = pd.DataFrame(slices['features'][feature]) - temp_continuous[feature] = pd.concat([self.continuous[feature], content], ignore_index = True) - temp_continuous[feature] = temp_continuous[feature].loc[self.scores.index, :].reset_index(drop = True) + self.continuous[feature] = pd.concat([self.continuous[feature], content], ignore_index = True) + self.continuous[feature] = self.continuous[feature].loc[self.scores.index, :].reset_index(drop=True) - for feature in self.categorical.items.values: - content = pd.DataFrame(slices['features'][feature], columns = self.categorical[feature].columns.values) - temp_categorical[feature] = pd.concat([self.categorical[feature], content], ignore_index = True) - temp_categorical[feature] = temp_categorical[feature].loc[self.scores.index, :].reset_index(drop = True) + for feature in self.categorical: + content = pd.DataFrame(slices['features'][feature], columns = self.categorical[feature].columns) + self.categorical[feature] = pd.concat([self.categorical[feature], content], ignore_index=True) + self.categorical[feature] = self.categorical[feature].loc[self.scores.index, :].reset_index(drop=True) - self.scores.reset_index(drop = True, inplace = True) - self.continuous = pd.Panel(temp_continuous) - self.categorical = pd.Panel(temp_categorical) + self.scores.reset_index(drop=True, inplace=True) def select_slices(self, similarity): indices = list(range(len(similarity))) @@ -82,12 +72,12 @@ def select_slices(self, similarity): return selected def reduce_slices(self): - if not self.continuous.empty: + if self.continuous: continuous_similarity = continuous_similarity_matrix(self.continuous) else: continuous_similarity = np.ones((len(self.scores), len(self.scores))) - if not self.categorical.empty: + if self.categorical: categorical_similarity = categorical_similarity_matrix(self.categorical) else: categorical_similarity = np.ones((len(self.scores), len(self.scores))) @@ -96,21 +86,19 @@ def reduce_slices(self): selected = self.select_slices(similarity) - if not self.categorical.empty: - self.categorical = self.categorical[:, selected, :] - self.categorical = pd.Panel({name : content.reset_index(drop = True) - for name, content in self.categorical.iteritems()}) + if self.categorical: + self.categorical = {key : df.loc[selected, :].reset_index(drop=True) + for key, df in self.categorical.items()} - if not self.continuous.empty: - self.continuous = self.continuous[:, selected, :] - self.continuous = pd.Panel({name : content.reset_index(drop = True) - for name, content in self.continuous.iteritems()}) + if self.continuous: + self.continuous = {key : df.loc[selected, :].reset_index(drop=True) + for key, df in self.continuous.items()} self.scores = self.scores.loc[selected].reset_index(drop = True) def to_dict(self): - continuous_dict = {name : df.to_dict(orient='list') for name, df in self.continuous.iteritems()} - categorical_dict = {name : df.to_dict(orient='list') for name, df in self.categorical.iteritems()} + continuous_dict = {name : df.to_dict(orient='list') for name, df in self.continuous.items()} + categorical_dict = {name : df.to_dict(orient='list') for name, df in self.categorical.items()} scores_list = self.scores.tolist() return {'continuous' : continuous_dict, 'categorical' : categorical_dict, 'scores' : scores_list, 'to_keep' : self.to_keep, 'threshold' : self.threshold} @@ -120,16 +108,17 @@ def to_output(self, name_mapping=None): result = [] for index, value in self.scores.iteritems(): - current_result = {'deviation' : value, 'features' : {}} - - if len(self.continuous.keys()) > 0: - for feature, values in self.continuous.major_xs(index).iteritems(): - current_result['features'][name_mapping(feature)] = values.to_dict() - - if len(self.categorical.keys()) > 0: - for feature, values in self.categorical.major_xs(index).iteritems(): - current_result['features'][name_mapping(feature)] = list(values.index[values == 1]) - result.append(current_result) + current_result = {'deviation' : value, 'features' : {}} + + if self.continuous: + for feature, df in self.continuous.items(): + current_result['features'][name_mapping(feature)] = df.loc[index, :].to_dict() + + if self.categorical: + for feature, df in self.categorical.items(): + selected_values = df.columns[df.loc[index, :] == 1].tolist() + current_result['features'][name_mapping(feature)] = selected_values + result.append(current_result) return result @staticmethod @@ -138,15 +127,15 @@ def default_threshold(dimensions): @staticmethod def from_dict(dictionary): - continuous_panel = pd.Panel({name : pd.DataFrame(description) - for name, description in dictionary['continuous'].items()}) - categorical_panel = pd.Panel({name : pd.DataFrame(description) - for name, description in dictionary['categorical'].items()}) + continuous = {name : pd.DataFrame(description) + for name, description in dictionary['continuous'].items()} + categorical = {name : pd.DataFrame(description) + for name, description in dictionary['categorical'].items()} scores_series = pd.Series(dictionary['scores']) slices = ScoredSlices([], [], to_keep = dictionary['to_keep'], threshold = dictionary['threshold']) - slices.categorical = categorical_panel - slices.continuous = continuous_panel + slices.categorical = categorical + slices.continuous = continuous slices.scores = scores_series return slices diff --git a/hics/slice_similarity.py b/hics/slice_similarity.py index 473ccae..601b675 100644 --- a/hics/slice_similarity.py +++ b/hics/slice_similarity.py @@ -3,11 +3,15 @@ def continuous_similarity_matrix(dfs): - length = len(dfs.major_axis) + if dfs: + length = len(list(dfs.values())[0]) + else: + length = 0 + volumn = np.zeros((length, length)) overlap = np.zeros((length, length)) - for index, df in dfs.iteritems(): + for index, df in dfs.items(): end = np.array([df['to_value']] * length) start = np.array([df['from_value']] * length) min_range = np.minimum((end - start), (end - start).T) @@ -33,11 +37,15 @@ def continuous_similarity_matrix(dfs): def categorical_similarity_matrix(dfs): - length = len(dfs.major_axis) + if dfs: + length = len(list(dfs.values())[0]) + else: + length = 0 + volumn = np.zeros((length, length)) overlap = np.zeros((length, length)) - for index, df in dfs.iteritems(): + for index, df in dfs.items(): data_array = np.array([np.array(df).tolist()] * len(df)) size_array = np.apply_along_axis(lambda x: (x*1).sum(), 2, data_array) diff --git a/tests/test_scored_slices.py b/tests/test_scored_slices.py index 56096aa..3828b8a 100644 --- a/tests/test_scored_slices.py +++ b/tests/test_scored_slices.py @@ -39,8 +39,8 @@ def test_to_output(self): ft_cat_2 = pd.DataFrame({'1' : [1], '2' : [0]}) ft_con_1 = pd.DataFrame({'from' : [2], 'to' : [3]}) ft_con_2 = pd.DataFrame({'from' : [8], 'to' : [9]}) - continuous = pd.Panel({'ft_con_1' : ft_con_1, 'ft_con_2' : ft_con_2}) - categorical = pd.Panel({'ft_cat_1' : ft_cat_1, 'ft_cat_2' : ft_cat_2}) + continuous = {'ft_con_1' : ft_con_1, 'ft_con_2' : ft_con_2} + categorical = {'ft_cat_1' : ft_cat_1, 'ft_cat_2' : ft_cat_2} scores = pd.Series([2.5]) scored_slices = ScoredSlices([], [], 2, 0.1) diff --git a/tests/test_slice_similarity.py b/tests/test_slice_similarity.py index 9aa8b98..20087e9 100644 --- a/tests/test_slice_similarity.py +++ b/tests/test_slice_similarity.py @@ -8,10 +8,10 @@ class Test_slice_similarity(TestCase): def test_categorical(self): result = np.array([[1, 0.5, 0.5], [0.5, 1, 0.25], [0.5, 0.25, 1]]) - categorical = pd.Panel({ + categorical = { 'X3' : pd.DataFrame({'a' : [1, 0, 1], 'b' : [1, 0, 0], 'c' : [1, 1, 1], 'd' : [0, 1, 0]}), 'X4' : pd.DataFrame({'a' : [1, 1, 0], 'b' : [1, 0, 0], 'c' : [1, 1, 1], 'd' : [0, 0, 1]}) - }) + } similarity = categorical_similarity_matrix(categorical) self.assertTrue(np.all(similarity == result)) @@ -19,10 +19,10 @@ def test_categorical(self): def test_continuous(self): result = np.array([[1, 0, 0], [0, 1, 2/3], [0, 2/3, 1]]) - continuous = pd.Panel({ + continuous = { 'X3' : pd.DataFrame({'from_value' : [0.5, 0, 0], 'to_value' : [1, 0.5, 0.75]}), 'X4' : pd.DataFrame({'from_value' : [0.5, 0, 0], 'to_value' : [1, 0.75, 0.5]}) - }) + } similarity = continuous_similarity_matrix(continuous) self.assertTrue(np.all(similarity == result))