-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmodeling.py
297 lines (277 loc) · 11.5 KB
/
modeling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# Data Analysis library
import pandas as pd
import matplotlib.pyplot as plt
# Machine Learning libraries
# model selection
from sklearn.model_selection import GridSearchCV
# classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
# helper preprocessing module
import prepare as pr
import test_string as ts
########### GLOBAL VARIABLES #########
# test string re-exported from the test_string helper module
test_string = ts.test_string
# random state seed for split and classification algorithms
seed = 42
# DataFrame to store the scores; the run_* functions below append rows to it
scores = pd.DataFrame(columns=['model_name', 'train_score', 'validate_score', 'score_difference'])
# create feature sets and target variables for modeling
X_train, X_validate, X_test, y_train, y_validate, y_test = pr.get_modeling_data()
# calculate a baseline: the share of the most frequent class in the training target.
# value_counts() orders rows by frequency, so the first row is the largest class.
# .iloc[0] reads that row by position; the result is indexed by class label, and
# using the integer key [0] on a label index is a deprecated positional fallback.
# NOTE(review): assumes y_train is a pandas Series of language names - confirm in prepare.py
baseline = round(y_train.value_counts(normalize=True).iloc[0], 2)
# some models don't accept text as a target
# create a map to digitize target variable
lang_map = {'Java': 0, 'C#': 1, 'JavaScript': 2, 'Python': 3}
y_train_numeric = y_train.map(lang_map)
y_validate_numeric = y_validate.map(lang_map)
y_test_numeric = y_test.map(lang_map)
############# FUNCTIONS TO RUN CLASSIFIERS #############
##### Decision Tree ########
def run_decision_tree(cv:int=5):
    '''
    Classifier: Decision Tree
    Tunes max_depth (1 through 10) with GridSearchCV on the training set,
    prints a header followed by the winning parameter combination,
    scores the best estimator on the train and validate sets
    and appends one row with those scores to the global scores dataframe
    -----------
    Parameters:
    cv: integer, number of cross validation folds for the grid search
    No returns
    '''
    # candidate tree depths: 1, 2, ..., 10
    depth_grid = {'max_depth': list(range(1, 11))}
    # grid search around a seeded decision tree
    search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=seed),
        param_grid=depth_grid,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    # this function also prints the header for the parameter report
    print('Best parameters per algorithm:')
    print('----------------------------------------------------')
    print(f'Decision Tree Parameters: {search.best_params_}')
    # score the estimator selected by the grid search
    best_tree = search.best_estimator_
    train_score = best_tree.score(X_train, y_train)
    validate_score = best_tree.score(X_validate, y_validate)
    # append the result as the next row of the scores dataframe
    new_row = ['Decision Tree', train_score, validate_score, train_score - validate_score]
    scores.loc[len(scores)] = new_row
##### Random Forest ########
def run_random_forest(cv:int=5):
    '''
    Classifier: Random Forest
    Tunes max_depth and min_samples_leaf with GridSearchCV on the training set,
    prints the winning parameter combination,
    scores the best estimator on the train and validate sets
    and appends one row with those scores to the global scores dataframe
    -----------
    Parameters:
    cv: integer, number of cross validation folds for the grid search
    No returns
    '''
    # parameter combinations to try
    forest_grid = {
        'max_depth': [5, 6, 7],
        'min_samples_leaf': [2, 3, 5],
    }
    # grid search around a seeded random forest
    search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=seed),
        param_grid=forest_grid,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    print(f'Random Forest Parameters: {search.best_params_}')
    # score the estimator selected by the grid search
    best_forest = search.best_estimator_
    train_score = best_forest.score(X_train, y_train)
    validate_score = best_forest.score(X_validate, y_validate)
    # append the result as the next row of the scores dataframe
    new_row = ['Random Forest', train_score, validate_score, train_score - validate_score]
    scores.loc[len(scores)] = new_row
##### Logistic Regression and Gaussian NB ########
def run_other():
    '''
    Classifier #1: Logistic Regression
    Classifier #2: Gaussian Naive Bayes
    Both are created without parameter tuning and handled one after another:
    fit on the training set, score on the train and validate sets,
    print a one-line note and append one row to the global scores dataframe
    -----------
    No parameters
    No returns
    '''
    # (row label, note to print, classifier) in the order they are run
    candidates = (
        ('Logistic Regression',
         'Logistic Regression: default paramters',
         LogisticRegression(random_state=seed)),
        ('Gaussian Naive Bayes',
         'Gaussian NB parameters: default paramters',
         GaussianNB()),
    )
    for label, note, clf in candidates:
        # fit on the train set
        clf.fit(X_train, y_train)
        # calculate scores
        train_acc = clf.score(X_train, y_train)
        validate_acc = clf.score(X_validate, y_validate)
        # print params
        print(note)
        # save the scores
        scores.loc[len(scores)] = [label, train_acc, validate_acc, train_acc - validate_acc]
##### Multinomial NB ########
def run_multinomial_nb(cv:int=3):
    '''
    Classifier: Multinomial Naive Bayes
    Tunes alpha with GridSearchCV on the training set,
    prints the winning parameter combination,
    scores the best estimator on the train and validate sets
    and appends one row with those scores to the global scores dataframe
    -----------
    Parameters:
    cv: integer, number of cross validation folds for the grid search
    No returns
    '''
    # smoothing values to try
    alpha_grid = {'alpha': [0.2, 0.5, 1.0]}
    # the base estimator starts from alpha=0.2; the grid above lists the alphas searched
    search = GridSearchCV(
        estimator=MultinomialNB(alpha=0.2),
        param_grid=alpha_grid,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    print(f'Multinomial NB Parameters: {search.best_params_}')
    # score the estimator selected by the grid search
    best_nb = search.best_estimator_
    train_score = best_nb.score(X_train, y_train)
    validate_score = best_nb.score(X_validate, y_validate)
    # append the result as the next row of the scores dataframe
    new_row = ['Multinomail Naive Bayes', train_score, validate_score, train_score - validate_score]
    scores.loc[len(scores)] = new_row
##### Gradient Boosting ########
def run_gradient_boosting(cv=3):
    '''
    Classifier: Gradient Boosting
    Tunes learning_rate, n_estimators and max_depth with GridSearchCV on the training set,
    prints the winning parameter combination,
    scores the best estimator on the train and validate sets
    and appends one row with those scores to the global scores dataframe
    -----------
    Parameters:
    cv: integer, number of cross validation folds for the grid search
    No returns
    '''
    # parameter combinations to try
    boosting_grid = {
        'learning_rate': [0.1, 0.2, 0.3],
        'n_estimators': [100, 150, 200],
        'max_depth': [5, 6, 7],
    }
    # grid search around a seeded gradient boosting classifier
    search = GridSearchCV(
        estimator=GradientBoostingClassifier(random_state=seed),
        param_grid=boosting_grid,
        cv=cv,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    print(f'Gradient Boosting Parameters: {search.best_params_}')
    # score the estimator selected by the grid search
    best_gb = search.best_estimator_
    train_score = best_gb.score(X_train, y_train)
    validate_score = best_gb.score(X_validate, y_validate)
    # append the result as the next row of the scores dataframe
    new_row = ['Gradient Boosting', train_score, validate_score, train_score - validate_score]
    scores.loc[len(scores)] = new_row
def run_xgboost(cv=3):
    '''
    Classifier: XGBoost
    Tunes max_depth and gamma with GridSearchCV on the training set,
    prints the winning parameter combination followed by an empty line,
    scores the best estimator on the train and validate sets
    and appends one row with those scores to the global scores dataframe
    Works with the numeric versions of the target (see lang_map)
    -----------
    Parameters:
    cv: integer, number of cross validation folds for the grid search
    No returns
    '''
    # parameter combinations to try
    xgb_grid = {
        'max_depth': [3, 4, 5, 6],
        'gamma': [0.1, 0.2, 0.3],
    }
    # base classifier with a fixed number of trees
    booster = xgb.XGBClassifier(n_estimators=100, eval_metric='merror', seed=seed)
    search = GridSearchCV(estimator=booster, param_grid=xgb_grid, cv=cv, n_jobs=-1)
    # fit on the train set using the digitized target
    search.fit(X_train, y_train_numeric)
    print(f'XGBoost Parameters: {search.best_params_}')
    print()
    # score the estimator selected by the grid search
    best_xgb = search.best_estimator_
    train_score = best_xgb.score(X_train, y_train_numeric)
    validate_score = best_xgb.score(X_validate, y_validate_numeric)
    # append the result as the next row of the scores dataframe
    new_row = ['XGBoost', train_score, validate_score, train_score - validate_score]
    scores.loc[len(scores)] = new_row
############# RUN ALL CLASSIFIERS #############
def run_all_classifiers() -> None:
    '''
    Runs all classifiers one after another with their default arguments.
    Each call prints its parameters and appends its result row(s)
    to the global scores dataframe.
    -----------
    No parameters
    No return values (the results are read from scores, e.g. via display_scores)
    '''
    # run_decision_tree goes first because it also prints the report header
    run_decision_tree()
    run_random_forest()
    run_other() # 2 classifiers
    run_multinomial_nb()
    run_gradient_boosting()
    run_xgboost()
def display_scores():
    '''
    Displays the global scores dataframe sorted by score_difference (ascending).
    Uses display() when that name is available and falls back to print() otherwise.
    -----------
    No parameters
    No return values
    '''
    ordered = scores.sort_values(by='score_difference')
    try:
        # display is not imported in this module, so it only resolves when the
        # running shell provides it (presumably IPython / Jupyter - confirm)
        show = display
    except NameError:
        # plain Python session: fall back to a text rendering
        show = print
    show(ordered)
def display_feature_importance():
    '''
    Creates an XGBoost Classifier, fits it on the training set and uses
    xgboost's plot_importance to show the 7 most important features.
    -----------
    No parameters
    No return values
    '''
    # set the default figure size BEFORE plotting: it is set first so that the
    # chart drawn below is created with it. The setting stays in plt.rcParams
    # afterwards, as before
    plt.rcParams['figure.figsize'] = [10,7]
    # we create the classifier again using the best parameters {'gamma': 0.2, 'max_depth': 4}
    xb = xgb.XGBClassifier(gamma=0.2, max_depth=4 ,n_estimators=100,eval_metric='merror',seed=seed)
    # XGBoost is fitted with the digitized target
    xb.fit(X_train, y_train_numeric)
    # create a barplot and display it
    xgb.plot_importance(xb, max_num_features=7)
    plt.show()
def run_best_model():
    '''
    Fits a Random Forest with max_depth=5 and min_samples_leaf=2 on the training set
    and scores it on the train, validate and test sets.
    Scores are rounded to 2 decimals.
    -----------
    No parameters
    Returns:
    pd.DataFrame with a single column 'result' and the rows
    'Model name', 'Train score', 'Validate score', 'Test score'
    '''
    rf = RandomForestClassifier(max_depth=5, min_samples_leaf=2, random_state=seed)
    rf.fit(X_train, y_train)
    # calculate scores
    train_score = round(rf.score(X_train, y_train), 2)
    validate_score = round(rf.score(X_validate, y_validate), 2)
    test_score = round(rf.score(X_test, y_test), 2)
    # the test-set predictions are not needed for the summary, so they are not computed
    return pd.DataFrame(
        {'result': ['Random Forest', train_score, validate_score, test_score]},
        index=['Model name', 'Train score', 'Validate score', 'Test score'],
    )
#####################################################
def predict_text(text:str):
    '''
    Predicts the programming language for a piece of text.
    The text is vectorized with pr.vectorize_for_predictions, then a Random Forest
    (max_depth=5, min_samples_leaf=2) is fitted on the training set and asked to predict.
    The model is re-fitted on every call.
    -----------
    Parameters:
    text: string with a text from README file
    Returns:
    the result of the classifier's predict() for the vectorized text
    (labels come from y_train: JavaScript, Java, C# or Python)
    '''
    # extra stopwords used by the vectorizer helper
    extra_stopwords = pr.get_additional_stopwords()
    features = pr.vectorize_for_predictions(extra_stopwords, text=text)
    # fit the model on the training data
    model = RandomForestClassifier(max_depth=5, min_samples_leaf=2, random_state=seed)
    model.fit(X_train, y_train)
    prediction = model.predict(features)
    return prediction