-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathselect_subgroup.py
119 lines (79 loc) · 3.3 KB
/
select_subgroup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import numpy as np
import pandas as pd
def select_subgroup(description=None, df=None, descriptives=None):
pairs = list(description.items())
num_atts = descriptives['num_atts']
bin_atts = descriptives['bin_atts']
nom_atts = descriptives['nom_atts']
ord_atts = descriptives['ord_atts']
if len(pairs) == 0:
idx = []
else:
# select all indices as a starting point
idx = df.index.values
for pair in pairs:
att = pair[0]
sel = pair[1]
if att in bin_atts:
idx = select_bin_att(idx=idx, att=att, sel=sel, df=df)
elif att in num_atts:
idx = select_num_att(idx=idx, att=att, sel=sel, df=df)
elif att in nom_atts:
idx = select_nom_att(idx=idx, att=att, sel=sel, df=df)
elif att in ord_atts:
idx = select_ord_att(idx=idx, att=att, sel=sel, df=df)
all_idx = df.index.values
idx_compl = np.setdiff1d(all_idx, idx)
# this should be loc!!
# make sure the dataset is sorted at the beginning of the algorithm
subgroup = df.loc[idx]
subgroup_compl = df.loc[idx_compl]
#print(list(idx_sg))
#print(list(idx_compl))
return subgroup, list(idx), subgroup_compl, list(idx_compl)
def select_bin_att(idx=None, att=None, sel=None, df=None):
idx_in = idx.copy()
if 'NaN' == sel[0]:
sel_idx = df[df[att].isnull()].index.values
else:
sel_idx = df[df[att] == sel[0]].index.values
idx_out = np.intersect1d(idx_in, sel_idx)
return idx_out
def select_num_att(idx=None, att=None, sel=None, df=None):
idx_in = idx.copy()
if 'NaN' == sel[0]:
sel_idx = df[df[att].isnull()].index.values
else:
low_idx = df[df[att] >= sel[0]].index.values # value can be equal to the lower bound
up_idx = df[df[att] <= sel[1]].index.values # value can be equal to the upper bound
sel_idx = np.intersect1d(low_idx, up_idx)
idx_out = np.intersect1d(idx_in, sel_idx)
return idx_out
def select_nom_att(idx=None, att=None, sel=None, df=None):
idx_in = idx.copy()
tup = sel
if 'NaN' == sel[0]:
sel_idx = df[df[att].isnull()].index.values
idx_in = np.intersect1d(idx_in, sel_idx)
# when the first value is a 1, then take all datapoints with the value in position two
if tup[0] == 1.0:
df_drop = df[df[att].notnull()]
sel_idx = df_drop[df_drop[att] == tup[1]].index.values
idx_in = np.intersect1d(idx_in, sel_idx)
# when the first value is a 0, then take all datapoints that do not have the value in position two
elif tup[0] == 0.0:
df_drop = df[df[att].notnull()]
sel_idx = df_drop[df_drop[att] != tup[1]].index.values
idx_in = np.intersect1d(idx_in, sel_idx)
idx_out = idx_in.copy()
return idx_out
def select_ord_att(idx=None, att=None, sel=None, df=None):
idx_in = idx.copy()
if 'NaN' == sel[0]:
sel_idx = df[df[att].isnull()].index.values
else:
# a loop over all categories in the description
# cases that equal either of the categories should be selected
sel_idx = df[df[att].isin(sel)].index.values
idx_out = np.intersect1d(idx_in, sel_idx)
return idx_out