-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
341 lines (265 loc) · 15 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#モジュールのインポート
import streamlit as st
import streamlit.components.v1 as components
import numpy as np
import codecs
import pandas as pd
from matplotlib import pyplot as plt
import sweetviz as sv
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, confusion_matrix, precision_recall_curve, mean_squared_error, make_scorer, r2_score , mean_absolute_error,mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
from sklearn.model_selection import KFold
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math
from base64 import b64encode
# タイトルを表示
st.set_page_config(layout="centered", page_title="線形モデル作成ツール")
st.title("GUI De Modeling")
st.markdown("created by Keisuke Kamata")
st.markdown("簡単に線形モデルを作成するためのツールです。データ加工はこのアプリに投入する前に行ってください。エラーが出た際は [email protected] までご連絡ください")
st.sidebar.markdown("### 1. データの読み込み")
uploaded_file = st.sidebar.file_uploader("CSVファイルをドラッグ&ドロップ、またはブラウザから選択してください", type='csv', key='train')
if uploaded_file is not None:
#データの読込み
df = pd.read_csv(uploaded_file)
df_0 = df
#object型をcategory型に変更
@st.cache
def change_categoryformat(df):
df_output = df
df_output.loc[:, df_output.dtypes == 'object'] = df.select_dtypes(['object']).apply(lambda x: x.astype('category'))
return df_output
df = change_categoryformat(df)
#ID・目的変数の選択
st.sidebar.markdown("### 2. ID・目的変数の選択")
target = st.sidebar.selectbox(
'目的変数を選択してください',
df.columns
)
id = st.sidebar.selectbox(
'IDを選択してください',
df.columns
)
#st.dataframe(df)
#st.markdown(df.shape)
#説明変数の選択
st.sidebar.markdown("### 3. 説明変数の選択")
unique_columns = df.drop(columns = [target]).columns.values
cols = st.sidebar.multiselect("",unique_columns,[])
#モデル構築
st.sidebar.markdown("### 4. モデル作成開始")
if cols is not None:
cv = st.sidebar.slider("交差検定数", min_value =1, max_value=50, value=10)
st.sidebar.write("※ホールドアウト用のデータが準備できる場合、別途準備してください")
if st.sidebar.button('モデル作成開始'):
# check process
if df[target].isnull().any() == "True":
st.write("errorです: 目的変数に欠損が含まれております。欠損が内容にデータを準備してください")
st.stop()
if df[target].dtypes != "float64" and df[target].dtypes != "int64":
try:
df[target] = df[target].astype("float64")
except:
st.write("errorです: 目的変数にカテゴリが含まれている可能性があります。データを確認し、再度データを取り込んでください")
st.stop()
df[target] = df[target].astype("float64")
for c in cols:
if df[c].dtypes != "float64" and df[c].dtypes != "int64":
try:
df[c] = df[c].astype("float64")
except:
st.write("errorです: ",c,"にカテゴリが含まれている可能性があります。データを確認して、再度データを取り込んでください")
st.stop()
df[c] = df[c].astype("float64")
with st.spinner('実行中...'):
@st.cache
def modeling(df,target,cols):
Y = df[target]
X = df[cols]
# 標準化する変数をリストアップ
standard_cols = []
for c in cols:
if len(df[c].unique()) > 2:
standard_cols.append(c)
# モデリング & 評価
folds = KFold(n_splits= cv, shuffle=True, random_state=0)
oof_preds = np.zeros(X.shape[0])
y_preds = []
y_vals = []
models = []
importance_list = []
perm_imp_list = []
val_inds = []
fold_label = np.zeros(X.shape[0])
def rmse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
mse_scorer = make_scorer(rmse)
for train_ind, val_ind in folds.split(X):
# partition
x_train, x_val = X.iloc[train_ind], X.iloc[val_ind]
y_train, y_val = Y.iloc[train_ind], Y.iloc[val_ind]
val_inds+=val_ind.tolist()
# missing value imputation (欠損flagも立てず、まずは中央値補完 <= もしデータの性質として欠損=0などがわかっている場合は、それを適用)
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.feature_names_in_ = cols
imp.fit(x_train)
x_train.loc[:] = imp.transform(x_train)
x_val.loc[:] = imp.transform(x_val)
# standardization / scaler
scaler = StandardScaler()
scaler.fit(x_train[standard_cols])
x_train.loc[:,standard_cols], x_val.loc[:,standard_cols] = scaler.transform(x_train[standard_cols]), scaler.transform(x_val[standard_cols])
# model
regr =ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=1000000000,random_state=0,normalize=False)
regr.fit(x_train, y_train)
# prediction
y_pred =regr.predict(x_val)
y_preds +=y_pred.tolist()
y_vals +=y_val.tolist()
# run permutation importance
result = permutation_importance(regr, x_train, y_train, scoring=mse_scorer, n_repeats=10, n_jobs=-1, random_state=71)
perm_imp_df = pd.DataFrame({"importances_mean":result["importances_mean"], "importances_std":result["importances_std"]}, index=X.columns)
perm_imp_list += [perm_imp_df]
Result = pd.DataFrame(columns=[id])
Result["id"] = df_0.iloc[val_inds][id].tolist()
Result["actual"] = df_0.iloc[val_inds][target].tolist()
Result["prediction"] = y_preds
Result["actual-prediction"] = Result["actual"] - Result["prediction"]
for col in cols:
Result[col] = df_0.iloc[val_inds][col].tolist()
return Result, y_vals,y_preds,perm_imp_list
Result,y_vals,y_preds,perm_imp_list= modeling(df,target,cols)
## evaluation
# metric
# st.markdown("#### 精度")
col1, col2, col3 = st.columns(3)
col1.metric(label="R2", value= round(r2_score(y_vals, y_preds),4))
col2.metric(label="MAE", value= mean_absolute_error(y_vals, y_preds))
col3.metric(label="MAPE", value= round(mean_absolute_percentage_error(y_vals, y_preds),4))
# permutation importance
# 参考: https://qiita.com/kenmatsu4/items/c49059f78c2b6fed0929
@st.cache
def permutation_importance_calc(df,cols,perm_imp_list):
permutation_imporatance_result = pd.DataFrame(columns = ["importances_mean"], index=df[cols].columns)
for col in df[cols].columns:
importance = 0
for i in range(0,5):
importance += perm_imp_list[i].loc[col,'importances_mean']
permutation_imporatance_result.loc[col,'importances_mean'] = importance/5
permutation_imporatance_result = permutation_imporatance_result*(-1)
permutation_imporatance_result.sort_values(by ='importances_mean', inplace=True, ascending=True)
return permutation_imporatance_result
permutation_imporatance_result = permutation_importance_calc(df,cols,perm_imp_list)
# figure
# 精度
combined_list = pd.concat([Result["actual"], Result["prediction"]])
reference_line = go.Scatter(x=[min(combined_list), max(combined_list)],
y=[min(combined_list), max(combined_list)],
mode='lines',line=dict(color='gray', width=1,dash='dot'),showlegend=False)
fig = px.scatter(Result, x="actual",y="prediction",hover_name="id")
#reference_line = go.Scatter(x=[min(Result["actual"].append(Result["prediction"])),max(Result["actual"].append(Result["prediction"]))], y=[min(Result["actual"].append(Result["prediction"])),max(Result["actual"].append(Result["prediction"]))],mode = 'lines',
# line=dict(color='gray', width=1,dash='dot'),showlegend=False)
fig.add_trace(reference_line)
layout = go.Layout(
title=dict(text='実測 vs 予測',),
title_x=0.5,
xaxis=dict(title='実測',),
yaxis=dict(title='予測',),
plot_bgcolor='white',
)
fig.update_xaxes(showline=True,
linewidth=1,
linecolor='lightgrey',
color='grey')
fig.update_yaxes(showline=True,
linewidth=1,
linecolor='lightgrey',
color='grey')
fig['layout'].update(layout)
st.plotly_chart(fig, use_container_width=True)
# 係数と変数の重要性
col1, col2 = st.columns([3,2])
# 変数の重要性
permutation_imporatance_result.sort_values(by ='importances_mean', inplace=True, ascending=True)
fig = go.Figure(go.Bar(
x = permutation_imporatance_result['importances_mean'],
y=permutation_imporatance_result.index.values,
orientation='h'))
layout = go.Layout(
plot_bgcolor='white',
)
fig['layout'].update(layout)
col1.write("変数の重要性 (Permutation importnace)")
col1.plotly_chart(fig, use_container_width=True)
# 全てのデータを使ってmodelingをし、係数を抽出
def modeling_all(df,target,cols):
Y = df[target].copy()
X = df.drop(columns = [target]).copy()
# 特徴量セット Final
X = X[cols]
# missing value imputation
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.feature_names_in_ = cols
imp.fit(X)
X.loc[:] = imp.transform(X)
# standadization / scaler
standard_cols = []
for c in cols:
if len(df[c].unique()) > 2:
standard_cols.append(c)
scaler = StandardScaler()
scaler.feature_names_in_ = standard_cols
scaler.fit(X[standard_cols])
X.loc[:,standard_cols] = scaler.transform(X[standard_cols])
# model
regr =ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=1000,random_state=0)
regr.fit(X, Y)
# coefficient & intercept
Result_coef = pd.DataFrame()
pred = 0
coefs = []
cols_input = cols
intercept = regr.intercept_
permutation_imporatance_result.sort_values(by ='importances_mean', inplace=True, ascending=False)
Result_coef = pd.DataFrame(columns = ["coefficient"], index=permutation_imporatance_result.index.values)
for col in cols:
index_0 = cols.index(col)
coef = regr.coef_[index_0]
if col in scaler.feature_names_in_:
index = scaler.feature_names_in_.tolist().index(col)
std = math.sqrt(scaler.var_[index])
mean = scaler.mean_[index]
intercept += (-1)*coef*mean/std
pred += df.loc[3,col]*coef/std
Result_coef.loc[col,"coefficient"] = coef/std
else:
Result_coef.loc[col,"coefficient"] = coef
Result_coef.loc["切片","coefficient"] = intercept
return Result_coef
Result_coef = modeling_all(df,target,cols)
col2.write("係数")
col2.dataframe(Result_coef)
@st.cache
def convert_df(df):
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_csv().encode('utf-8')
csv = convert_df(Result_coef)
st.download_button(
label="Download 係数",
data=csv,
file_name='coeffient.csv',
mime='text/csv',
)
csv2 = convert_df(Result)
st.download_button(
label="Download 交差検定予測結果",
data=csv2,
file_name='prediction.csv',
mime='text/csv',
)