import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from Orange.data import Table
from Orange.preprocess.score import UnivariateLinearRegression, RReliefF
from Orange.regression.random_forest import RandomForestRegressionLearner
from fss import relief_top_attributes, linear_top_attributes, rf_top_attributes


def get_ranker_scores(ranker, data):
    """
    Shuffle Y 100 times and calculate 'random' ranking scores on the same data.
    We then check how good our real calculated scores are compared to these 'random' scores.
    Note: np.random.shuffle permutes data.Y in place, so the original target order is lost.
    """
    rank_scores = []
    for x in range(100):
        np.random.shuffle(data.Y)
        scores = ranker(data)  # ranker scores for the randomized target column
        for score in scores:
            rank_scores.append(score)
    rank_scores_clean = [score for score in rank_scores if not np.isnan(score)]  # drop NaN scores
    print(len(rank_scores_clean))
    return rank_scores_clean
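

# A minimal sketch (not part of the original module) of the permutation idea used by
# get_ranker_scores and get_forest_scores: shuffling Y breaks any real feature-target
# relationship, so scores computed on shuffled data approximate the null distribution a
# ranker produces by chance. Because data.Y is shuffled in place, observed scores must be
# computed before the null scores, e.g.:
#
#   observed = RReliefF(random_state=0)(data)                         # real scores
#   null_scores = get_ranker_scores(RReliefF(random_state=0), data)   # shuffles data.Y!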


def get_forest_scores(data):
    """
    Shuffle Y 100 times and calculate 'random' random-forest scores on the same data.
    We then check how good our real calculated scores are compared to these 'random' scores.
    """
    rank_scores = []
    for x in range(100):
        np.random.shuffle(data.Y)
        method = RandomForestRegressionLearner(n_estimators=100, min_samples_split=5, random_state=0)
        scores = method.score(data)[0]
        for score in scores:
            rank_scores.append(score)
    rank_scores_clean = [score for score in rank_scores if not np.isnan(score)]  # drop NaN scores
    print(len(rank_scores_clean))
    return rank_scores_clean


def plot_dist(scores):
    plt.figure(figsize=(14, 16))
    sns.histplot(data=scores).set(title="regresijski srečkovič")
    plt.show()


def calculate_p_value(distribution, top_score):
    """
    Count the values in the distribution that are greater than or equal to top_score;
    p_val = cnt / len(distribution).
    """
    cnt = 0
    for score in distribution:
        if score >= top_score:
            cnt += 1
    p_val = cnt / len(distribution)
    return p_val
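

# Worked example (illustrative, not from the original file): with
# distribution = [0.1, 0.2, 0.3, 0.4] and top_score = 0.25, two values (0.3 and 0.4)
# are >= 0.25, so calculate_p_value returns 2 / 4 = 0.5.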


def get_ranker_p_values(random_scores, top_factors):
    """
    Create a dictionary in which the keys are factor names and the values are their p-values.
    """
    p_values = {}
    for score, att_name in top_factors:
        p_val = calculate_p_value(random_scores, score)
        p_values[att_name] = p_val
    return p_values


def get_p_values_for_top_factors(data):
    # For each ranking method, get the top attributes.
    relief_top_factors = relief_top_attributes(data)
    linear_top_factors = linear_top_attributes(data)
    random_top_factors = rf_top_attributes(data)
    all_top_factors = [relief_top_factors, linear_top_factors, random_top_factors]

    # For each ranking method, generate 'random' ranking scores.
    ranker_scores = []
    for ranker in [RReliefF(random_state=0), UnivariateLinearRegression()]:
        ranker_scores.append(get_ranker_scores(ranker, data))
    ranker_scores.append(get_forest_scores(data))

    rank_method_names = ['R', 'L', 'F']  # initials of the ranking methods
    slovarek = {}  # holds the final p-value for each attribute name

    # Combine the list of random scores, the list of top factors and the ranking method name.
    for random_scores, top_factors, method_name in zip(ranker_scores, all_top_factors, rank_method_names):
        p_values = get_ranker_p_values(random_scores, top_factors)  # dictionary [att_name] --> attribute p-value
        # Iterate through every attribute name and its corresponding p-value:
        # if 'slovarek' does not yet hold a value for this attribute name, add it;
        # if 'slovarek' already holds a value for this attribute name, overwrite it when the
        # current p-value is better (smaller) than the existing one.
        for att_name, p_val in p_values.items():
            if att_name not in slovarek:
                slovarek[att_name] = (p_val, method_name)  # [att_name] --> (p-value, method_name)
            else:
                obstojeci_p_val, obstojeca_metoda = slovarek[att_name]
                if p_val < obstojeci_p_val:
                    print(f'For {att_name}, overwriting {obstojeci_p_val} from {obstojeca_metoda} with {p_val} from {method_name}')
                    slovarek[att_name] = (p_val, method_name)  # [att_name] --> (p-value, method_name)
    print(slovarek)
    return slovarek
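

# Example of the merge above (hypothetical attribute and values): if RReliefF ('R') gives
# attribute "age" p = 0.04 and the random forest ('F') gives it p = 0.01, the dictionary
# ends up with {"age": (0.01, 'F')}, i.e. the smallest p-value wins and the entry remembers
# which ranking method produced it.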


def google(ime_datoteke):
    """Rewrite <ime_datoteke>.csv with ':' as the field separator and ',' as the decimal mark.
    Not currently used anywhere."""
    with open(f'{ime_datoteke}.csv') as f:
        tabela_str = f.read()
    tabela_str = tabela_str.replace(',', ':').replace('.', ',')
    with open(f'{ime_datoteke}.txt', 'w') as f:
        f.write(tabela_str)
    print(f"written for google to {ime_datoteke}.txt")


if __name__ == "__main__":
    data = Table("C:\\Users\\irisc\\Documents\\FRI\\blaginja\\FRI-blaginja\\input data\\ranking_survey.pkl")
    slovarcek_p_values = get_p_values_for_top_factors(data)
    """
    rez = get_p_value(ranker_scores)
    plot_dist(ranker_scores)
    """